From 15daa59274bd6cb276b05d0aaf68ff215291dedf Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Tue, 9 Dec 2025 01:33:05 -0800
Subject: [PATCH 01/13] Buddy Gemmini: add matmul input and print-after-all IR dump

---
 experiments/gemmini/inputs/matmul.mlir        |   7 ++
 .../gemmini/logs/matmul.print-after-all.mlir  | 113 ++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 experiments/gemmini/inputs/matmul.mlir
 create mode 100644 experiments/gemmini/logs/matmul.print-after-all.mlir

diff --git a/experiments/gemmini/inputs/matmul.mlir b/experiments/gemmini/inputs/matmul.mlir
new file mode 100644
index 0000000..f6fb3b4
--- /dev/null
+++ b/experiments/gemmini/inputs/matmul.mlir
@@ -0,0 +1,7 @@
+// 64x64 matmul input: f16 operands, f32 result buffer (IR dump: logs/matmul.print-after-all.mlir).
+module {
+  func.func @matmul(%lhs: memref<64x64xf16>, %rhs: memref<64x64xf16>, %out: memref<64x64xf32>) {
+    linalg.matmul ins(%lhs, %rhs : memref<64x64xf16>, memref<64x64xf16>) outs(%out : memref<64x64xf32>)
+    return
+  }
+}
diff --git a/experiments/gemmini/logs/matmul.print-after-all.mlir b/experiments/gemmini/logs/matmul.print-after-all.mlir
new file mode 100644
index 0000000..6e9a5cc
--- /dev/null
+++ b/experiments/gemmini/logs/matmul.print-after-all.mlir
@@ -0,0 +1,113 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {
+  func.func @matmul(%arg0: memref<64x64xf16>, %arg1: memref<64x64xf16>, %arg2: memref<64x64xf32>) {
+    %alloc = memref.alloc() : memref<64x64xi32>
+    %c0_i32 = arith.constant 0 : i32
+    linalg.fill ins(%c0_i32 : i32) outs(%alloc : memref<64x64xi32>)
+    gemmini.tile_matmul %arg0 %arg1 %arg2 %alloc : memref<64x64xf16> memref<64x64xf16> memref<64x64xf32> memref<64x64xi32>
+    memref.dealloc %alloc : memref<64x64xi32>
+    return
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @matmul(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: !llvm.ptr, %arg15: !llvm.ptr, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64) {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %1 = llvm.insertvalue %arg14, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %2 = llvm.insertvalue %arg15, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %3 = llvm.insertvalue %arg16, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %4 = llvm.insertvalue %arg17, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %5 = llvm.insertvalue %arg19, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %6 = llvm.insertvalue %arg18, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %7 = llvm.insertvalue %arg20, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %8 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %9 = llvm.insertvalue %arg7, %8[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %10 = llvm.insertvalue %arg8, %9[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %11 = llvm.insertvalue %arg9, %10[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %12 = llvm.insertvalue %arg10, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %13 = llvm.insertvalue %arg12, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %14 = llvm.insertvalue %arg11, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %15 = llvm.insertvalue %arg13, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %16 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %17 = llvm.insertvalue %arg0, %16[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %18 = llvm.insertvalue %arg1, %17[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %19 = llvm.insertvalue %arg2, %18[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %20 = llvm.insertvalue %arg3, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %21 = llvm.insertvalue %arg5, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %22 = llvm.insertvalue %arg4, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %23 = llvm.insertvalue %arg6, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %24 = llvm.mlir.constant(64 : index) : i64
+    %25 = llvm.mlir.constant(64 : index) : i64
+    %26 = llvm.mlir.constant(1 : index) : i64
+    %27 = llvm.mlir.constant(4096 : index) : i64
+    %28 = llvm.mlir.zero : !llvm.ptr
+    %29 = llvm.getelementptr %28[%27] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %30 = llvm.ptrtoint %29 : !llvm.ptr to i64
+    %31 = llvm.call @malloc(%30) : (i64) -> !llvm.ptr
+    %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %33 = llvm.insertvalue %31, %32[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %34 = llvm.insertvalue %31, %33[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %35 = llvm.mlir.constant(0 : index) : i64
+    %36 = llvm.insertvalue %35, %34[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %37 = llvm.insertvalue %24, %36[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %38 = llvm.insertvalue %25, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %39 = llvm.insertvalue %25, %38[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %40 = llvm.insertvalue %26, %39[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %41 = builtin.unrealized_conversion_cast %40 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<64x64xi32>
+    %42 = llvm.mlir.constant(0 : i32) : i32
+    linalg.fill ins(%42 : i32) outs(%41 : memref<64x64xi32>)
+    %43 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %44 = llvm.ptrtoint %43 : !llvm.ptr to i64
+    %45 = llvm.extractvalue %15[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %46 = llvm.ptrtoint %45 : !llvm.ptr to i64
+    %47 = llvm.extractvalue %7[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %48 = llvm.ptrtoint %47 : !llvm.ptr to i64
+    %49 = llvm.extractvalue %40[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %50 = llvm.ptrtoint %49 : !llvm.ptr to i64
+    %51 = llvm.mlir.constant(4575657221408489476 : i64) : i64
+    %52 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%51, %52) : (i64, i64) -> ()
+    %53 = llvm.mlir.constant(64 : i64) : i64
+    %54 = llvm.mlir.constant(2 : i64) : i64
+    %55 = llvm.mlir.constant(4575657221408424000 : i64) : i64
+    "gemmini.intr.config_st"(%54, %55) : (i64, i64) -> ()
+    %56 = llvm.mlir.constant(64 : i64) : i64
+    %57 = llvm.mlir.constant(4575657221409472769 : i64) : i64
+    "gemmini.intr.config_ld"(%57, %56) : (i64, i64) -> ()
+    %58 = llvm.mlir.constant(64 : i64) : i64
+    %59 = llvm.mlir.constant(4575657221409472777 : i64) : i64
+    "gemmini.intr.config_ld"(%59, %58) : (i64, i64) -> ()
+    %60 = llvm.mlir.constant(256 : i64) : i64
+    %61 = llvm.mlir.constant(4575657221409472785 : i64) : i64
+    "gemmini.intr.config_ld"(%61, %60) : (i64, i64) -> ()
+    %62 = llvm.mlir.constant(0 : i64) : i64
+    %63 = llvm.mlir.constant(0 : i64) : i64
+    %64 = llvm.mlir.constant(0 : i64) : i64
+    %65 = llvm.mlir.constant(0 : i64) : i64
+    %66 = llvm.mlir.constant(0 : i64) : i64
+    %67 = llvm.mlir.constant(17180131332 : i64) : i64
+    "gemmini.intr.loop_ws_config_bounds"(%66, %67) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_ab"(%44, %46) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_dc"(%50, %48) : (i64, i64) -> ()
+    %68 = llvm.mlir.constant(64 : i64) : i64
+    %69 = llvm.mlir.constant(64 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_ab"(%68, %69) : (i64, i64) -> ()
+    %70 = llvm.mlir.constant(64 : i64) : i64
+    %71 = llvm.mlir.constant(64 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_dc"(%70, %71) : (i64, i64) -> ()
+    %72 = llvm.mlir.constant(1 : i64) : i64
+    %73 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_ws"(%72, %73) : (i64, i64) -> ()
+    %74 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%74, %74) : (i64, i64) -> ()
+    %75 = llvm.extractvalue %40[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%75) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+}
+
+

From 5fef6bad9afabf4cec78f7d388982322c42daefc Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Tue, 9 Dec 2025 01:38:35 -0800
Subject: [PATCH 02/13] Buddy Gemmini: add batch_matmul + conv IR dumps

---
 experiments/gemmini/inputs/batch_matmul.mlir  |  30 +
 .../gemmini/inputs/conv_2d_nchw_fchw_f32.mlir |  51 ++
 .../batch_matmul.mlir.print-after-all.mlir    | 411 ++++++++++++
 ...2d_nchw_fchw_f32.mlir.print-after-all.mlir | 594 ++++++++++++++++++
 4 files changed, 1086 insertions(+)
 create mode 100644 experiments/gemmini/inputs/batch_matmul.mlir
 create mode 100644 experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir
 create mode 100644 experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir
 create mode 100644 experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir

diff --git a/experiments/gemmini/inputs/batch_matmul.mlir b/experiments/gemmini/inputs/batch_matmul.mlir
new file mode 100644
index 0000000..1cf5347
--- /dev/null
+++ b/experiments/gemmini/inputs/batch_matmul.mlir
@@ -0,0 +1,30 @@
+// RUN: buddy-opt %s \
+// RUN:     --convert-linalg-to-gemmini | \
+// RUN: FileCheck %s
+
+// Fills two 3x3x3 i8 buffers with 1 and 2, runs linalg.batch_matmul on them into
+// %output and prints it. One gemmini.tile_matmul is expected per batch (offsets 0, 9, 18).
+// NOTE(review): %input0/%input1 are never deallocated (only %output is); left as-is
+// so the recorded dump in logs/batch_matmul.mlir.print-after-all.mlir still matches.
+func.func @main() -> i8 {
+  %0 = arith.constant 0 : i8
+  %1 = arith.constant 1 : i8
+  %2 = arith.constant 2 : i8
+  %input0 = memref.alloc() : memref<3x3x3xi8>
+  %input1 = memref.alloc() : memref<3x3x3xi8>
+  %output = memref.alloc() : memref<3x3x3xi8>
+  linalg.fill ins(%1 : i8) outs(%input0 : memref<3x3x3xi8>)
+  linalg.fill ins(%2 : i8) outs(%input1 : memref<3x3x3xi8>)
+  // CHECK: gemmini.tile_matmul %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} :
+  // CHECK-SAME: memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi32>
+  // CHECK: gemmini.tile_matmul %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} :
+  // CHECK-SAME: memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi32>
+  // CHECK: gemmini.tile_matmul %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} %{{[a-z_0-9]+}} :
+  // CHECK-SAME: memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi32>
+  linalg.batch_matmul
+    ins(%input0, %input1 : memref<3x3x3xi8>, memref<3x3x3xi8>)
+    outs(%output : memref<3x3x3xi8>)
+  gemmini.print %output : memref<3x3x3xi8>
+  memref.dealloc %output : memref<3x3x3xi8>
+  return %0 : i8
+}
diff --git a/experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir b/experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir
new file mode 100644
index 0000000..1091167
--- /dev/null
+++ b/experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir
@@ -0,0 +1,51 @@
+// RUN: buddy-opt %s \
+// RUN:     --convert-linalg-to-gemmini="acc_t=f32" | \
+// RUN: FileCheck %s
+
+memref.global "private" @input : memref<2x2x5x5xf32> = dense<[[[[1., 0., -1., 0., 1.],
+                                                                [1., 0., -1., 0., 1.],
+                                                                [1., 0., -1., 0., 1.],
+                                                                [1., 0., -1., 0., 1.],
+                                                                [-1., 0., 1., 0., -1.]],
+                                                               [[-1., 0., 1., 0., -1.],
+                                                                [-1., 0., 1., 0., -1.],
+                                                                [-1., 0., 1., 0., -1.],
+                                                                [-1., 0., 1., 0., -1.],
+                                                                [-1., 0., 1., 0., -1.]]],
+                                                              [[[1., 0., 2., 0., 1.],
+                                                                [1., 0., 2., 0., 1.],
+                                                                [1., 0., 2., 0., 1.],
+                                                                [1., 0., 2., 0., 1.],
+                                                                [-1., 0., 2., 0., -1.]],
+                                                               [[-1., 0., 2., 0., -1.],
+                                                                [-1., 0., 2., 0., -1.],
+                                                                [-1., 0., 2., 0., -1.],
+                                                                [-1., 0., 2., 0., -1.],
+                                                                [-1., 0., 2., 0., -1.]]]]>
+
+memref.global "private" @weight : memref<2x2x3x3xf32> = dense<[[[[1., 2., 3.],
+                                                                 [3., 2., 1.],
+                                                                 [1., 2., 3.]],
+                                                                [[3., 2., 1.],
+                                                                 [1., 2., 3.],
+                                                                 [3., 2., 1.]]],
+                                                               [[[1., 2., 3.],
+                                                                 [3., 2., 1.],
+                                                                 [1., 2., 3.]],
+                                                                [[3., 2., 1.],
+                                                                 [1., 2., 3.],
+                                                                 [3., 2., 1.]]]]>
+
+// Convolves the @input global (2x2x5x5) with the @weight global (2x2x3x3) into a freshly
+// allocated 2x2x3x3 buffer, prints that buffer, and returns 0 (RUN lowers with acc_t=f32).
+func.func @main() -> i8 {
+  %status = arith.constant 0 : i8
+  %input = memref.get_global @input : memref<2x2x5x5xf32>
+  %weight = memref.get_global @weight : memref<2x2x3x3xf32>
+  %output = memref.alloc() : memref<2x2x3x3xf32>
+  // CHECK: gemmini.tile_conv %alloc_{{[0-9]+}} %alloc_{{[0-9]+}} %alloc_{{[0-9]+}} %alloc_{{[0-9]+}} %{{.+}} %{{.+}} : 
+  // CHECK-SAME: memref<2x5x5x2xf32> memref<18x2xf32> memref<2xf32> memref<18x2xf32> i64 i64
+  linalg.conv_2d_nchw_fchw ins(%input, %weight : memref<2x2x5x5xf32>, memref<2x2x3x3xf32>) outs(%output : memref<2x2x3x3xf32>)
+  gemmini.print %output : memref<2x2x3x3xf32>
+  return %status : i8
+}
diff --git a/experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir b/experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir
new file mode 100644
index 0000000..8f6ed5e
--- /dev/null
+++ b/experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir
@@ -0,0 +1,411 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {
+  func.func @main() -> i8 {
+    %c0_i8 = arith.constant 0 : i8
+    %c1_i8 = arith.constant 1 : i8
+    %c2_i8 = arith.constant 2 : i8
+    %alloc = memref.alloc() : memref<3x3x3xi8>
+    %alloc_0 = memref.alloc() : memref<3x3x3xi8>
+    %alloc_1 = memref.alloc() : memref<3x3x3xi8>
+    linalg.fill ins(%c1_i8 : i8) outs(%alloc : memref<3x3x3xi8>)
+    linalg.fill ins(%c2_i8 : i8) outs(%alloc_0 : memref<3x3x3xi8>)
+    %subview = memref.subview %alloc[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>>
+    %subview_2 = memref.subview %alloc_0[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>>
+    %subview_3 = memref.subview %alloc_1[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>>
+    %alloc_4 = memref.alloc() : memref<3x3xi32>
+    %c0_i32 = arith.constant 0 : i32
+    linalg.fill ins(%c0_i32 : i32) outs(%alloc_4 : memref<3x3xi32>)
+    gemmini.tile_matmul %subview %subview_2 %subview_3 %alloc_4 : memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi32>
+    memref.dealloc %alloc_4 : memref<3x3xi32>
+    %subview_5 = memref.subview %alloc[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>>
+    %subview_6 = memref.subview %alloc_0[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>>
+    %subview_7 = memref.subview %alloc_1[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>>
+    %alloc_8 = memref.alloc() : memref<3x3xi32>
+    %c0_i32_9 = arith.constant 0 : i32
+    linalg.fill ins(%c0_i32_9 : i32) outs(%alloc_8 : memref<3x3xi32>)
+    gemmini.tile_matmul %subview_5 %subview_6 %subview_7 %alloc_8 : memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi32>
+    memref.dealloc %alloc_8 : memref<3x3xi32>
+    %subview_10 = memref.subview %alloc[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>>
+    %subview_11 = memref.subview %alloc_0[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>>
+    %subview_12 = memref.subview %alloc_1[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>>
+    %alloc_13 = memref.alloc() : memref<3x3xi32>
+    %c0_i32_14 = arith.constant 0 : i32
+    linalg.fill ins(%c0_i32_14 : i32) outs(%alloc_13 : memref<3x3xi32>)
+    gemmini.tile_matmul %subview_10 %subview_11 %subview_12 %alloc_13 : memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi32>
+    memref.dealloc %alloc_13 : memref<3x3xi32>
+    gemmini.print %alloc_1 : memref<3x3x3xi8>
+    memref.dealloc %alloc_1 : memref<3x3x3xi8>
+    return %c0_i8 : i8
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.mlir.global internal constant @nl("\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @frmt_spec("%d \00") {addr_space = 0 : i32}
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @main() -> i8 {
+    %0 = llvm.mlir.constant(0 : i8) : i8
+    %1 = llvm.mlir.constant(1 : i8) : i8
+    %2 = llvm.mlir.constant(2 : i8) : i8
+    %3 = llvm.mlir.constant(3 : index) : i64
+    %4 = llvm.mlir.constant(3 : index) : i64
+    %5 = llvm.mlir.constant(3 : index) : i64
+    %6 = llvm.mlir.constant(1 : index) : i64
+    %7 = llvm.mlir.constant(9 : index) : i64
+    %8 = llvm.mlir.constant(27 : index) : i64
+    %9 = llvm.mlir.zero : !llvm.ptr
+    %10 = llvm.getelementptr %9[%8] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %11 = llvm.ptrtoint %10 : !llvm.ptr to i64
+    %12 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
+    %13 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %14 = llvm.insertvalue %12, %13[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %15 = llvm.insertvalue %12, %14[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %16 = llvm.mlir.constant(0 : index) : i64
+    %17 = llvm.insertvalue %16, %15[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %18 = llvm.insertvalue %3, %17[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %19 = llvm.insertvalue %4, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %20 = llvm.insertvalue %5, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %21 = llvm.insertvalue %7, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %22 = llvm.insertvalue %5, %21[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %23 = llvm.insertvalue %6, %22[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %24 = builtin.unrealized_conversion_cast %23 : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> to memref<3x3x3xi8>
+    %25 = llvm.mlir.constant(3 : index) : i64
+    %26 = llvm.mlir.constant(3 : index) : i64
+    %27 = llvm.mlir.constant(3 : index) : i64
+    %28 = llvm.mlir.constant(1 : index) : i64
+    %29 = llvm.mlir.constant(9 : index) : i64
+    %30 = llvm.mlir.constant(27 : index) : i64
+    %31 = llvm.mlir.zero : !llvm.ptr
+    %32 = llvm.getelementptr %31[%30] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %33 = llvm.ptrtoint %32 : !llvm.ptr to i64
+    %34 = llvm.call @malloc(%33) : (i64) -> !llvm.ptr
+    %35 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %36 = llvm.insertvalue %34, %35[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %37 = llvm.insertvalue %34, %36[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %38 = llvm.mlir.constant(0 : index) : i64
+    %39 = llvm.insertvalue %38, %37[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %40 = llvm.insertvalue %25, %39[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %41 = llvm.insertvalue %26, %40[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %42 = llvm.insertvalue %27, %41[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %43 = llvm.insertvalue %29, %42[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %44 = llvm.insertvalue %27, %43[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %45 = llvm.insertvalue %28, %44[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %46 = builtin.unrealized_conversion_cast %45 : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> to memref<3x3x3xi8>
+    %47 = llvm.mlir.constant(3 : index) : i64
+    %48 = llvm.mlir.constant(3 : index) : i64
+    %49 = llvm.mlir.constant(3 : index) : i64
+    %50 = llvm.mlir.constant(1 : index) : i64
+    %51 = llvm.mlir.constant(9 : index) : i64
+    %52 = llvm.mlir.constant(27 : index) : i64
+    %53 = llvm.mlir.zero : !llvm.ptr
+    %54 = llvm.getelementptr %53[%52] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %55 = llvm.ptrtoint %54 : !llvm.ptr to i64
+    %56 = llvm.call @malloc(%55) : (i64) -> !llvm.ptr
+    %57 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %58 = llvm.insertvalue %56, %57[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %59 = llvm.insertvalue %56, %58[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %60 = llvm.mlir.constant(0 : index) : i64
+    %61 = llvm.insertvalue %60, %59[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %62 = llvm.insertvalue %47, %61[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %63 = llvm.insertvalue %48, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %64 = llvm.insertvalue %49, %63[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %65 = llvm.insertvalue %51, %64[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %66 = llvm.insertvalue %49, %65[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %67 = llvm.insertvalue %50, %66[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %68 = builtin.unrealized_conversion_cast %67 : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> to memref<3x3x3xi8>
+    linalg.fill ins(%1 : i8) outs(%24 : memref<3x3x3xi8>)
+    linalg.fill ins(%2 : i8) outs(%46 : memref<3x3x3xi8>)
+    %subview = memref.subview %24[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>>
+    %69 = builtin.unrealized_conversion_cast %subview : memref<3x3xi8, strided<[3, 1]>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %subview_0 = memref.subview %46[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>>
+    %70 = builtin.unrealized_conversion_cast %subview_0 : memref<3x3xi8, strided<[3, 1]>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %subview_1 = memref.subview %68[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>>
+    %71 = builtin.unrealized_conversion_cast %subview_1 : memref<3x3xi8, strided<[3, 1]>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %72 = llvm.mlir.constant(3 : index) : i64
+    %73 = llvm.mlir.constant(3 : index) : i64
+    %74 = llvm.mlir.constant(1 : index) : i64
+    %75 = llvm.mlir.constant(9 : index) : i64
+    %76 = llvm.mlir.zero : !llvm.ptr
+    %77 = llvm.getelementptr %76[%75] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %78 = llvm.ptrtoint %77 : !llvm.ptr to i64
+    %79 = llvm.call @malloc(%78) : (i64) -> !llvm.ptr
+    %80 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %81 = llvm.insertvalue %79, %80[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %82 = llvm.insertvalue %79, %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %83 = llvm.mlir.constant(0 : index) : i64
+    %84 = llvm.insertvalue %83, %82[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %85 = llvm.insertvalue %72, %84[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %86 = llvm.insertvalue %73, %85[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %87 = llvm.insertvalue %73, %86[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %88 = llvm.insertvalue %74, %87[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %89 = builtin.unrealized_conversion_cast %88 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x3xi32>
+    %90 = llvm.mlir.constant(0 : i32) : i32
+    linalg.fill ins(%90 : i32) outs(%89 : memref<3x3xi32>)
+    %91 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %92 = llvm.ptrtoint %91 : !llvm.ptr to i64
+    %93 = llvm.mlir.constant(0 : index) : i64
+    %94 = llvm.extractvalue %70[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %95 = llvm.ptrtoint %94 : !llvm.ptr to i64
+    %96 = llvm.mlir.constant(0 : index) : i64
+    %97 = llvm.extractvalue %71[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %98 = llvm.ptrtoint %97 : !llvm.ptr to i64
+    %99 = llvm.mlir.constant(0 : index) : i64
+    %100 = llvm.extractvalue %88[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %101 = llvm.ptrtoint %100 : !llvm.ptr to i64
+    %102 = llvm.mlir.constant(4575657221408489476 : i64) : i64
+    %103 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%102, %103) : (i64, i64) -> ()
+    %104 = llvm.mlir.constant(3 : i64) : i64
+    %105 = llvm.mlir.constant(2 : i64) : i64
+    %106 = llvm.mlir.constant(4575657221408423939 : i64) : i64
+    "gemmini.intr.config_st"(%105, %106) : (i64, i64) -> ()
+    %107 = llvm.mlir.constant(3 : i64) : i64
+    %108 = llvm.mlir.constant(4575657221409472769 : i64) : i64
+    "gemmini.intr.config_ld"(%108, %107) : (i64, i64) -> ()
+    %109 = llvm.mlir.constant(3 : i64) : i64
+    %110 = llvm.mlir.constant(4575657221409472777 : i64) : i64
+    "gemmini.intr.config_ld"(%110, %109) : (i64, i64) -> ()
+    %111 = llvm.mlir.constant(12 : i64) : i64
+    %112 = llvm.mlir.constant(4575657221409472785 : i64) : i64
+    "gemmini.intr.config_ld"(%112, %111) : (i64, i64) -> ()
+    %113 = llvm.mlir.constant(0 : i64) : i64
+    %114 = llvm.mlir.constant(0 : i64) : i64
+    %115 = llvm.mlir.constant(0 : i64) : i64
+    %116 = llvm.mlir.constant(0 : i64) : i64
+    %117 = llvm.mlir.constant(55835426829 : i64) : i64
+    %118 = llvm.mlir.constant(4295032833 : i64) : i64
+    "gemmini.intr.loop_ws_config_bounds"(%117, %118) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_ab"(%92, %95) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_dc"(%101, %98) : (i64, i64) -> ()
+    %119 = llvm.mlir.constant(3 : i64) : i64
+    %120 = llvm.mlir.constant(3 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_ab"(%119, %120) : (i64, i64) -> ()
+    %121 = llvm.mlir.constant(3 : i64) : i64
+    %122 = llvm.mlir.constant(3 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_dc"(%121, %122) : (i64, i64) -> ()
+    %123 = llvm.mlir.constant(1 : i64) : i64
+    %124 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_ws"(%123, %124) : (i64, i64) -> ()
+    %125 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%125, %125) : (i64, i64) -> ()
+    %126 = llvm.extractvalue %88[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%126) : (!llvm.ptr) -> ()
+    %subview_2 = memref.subview %24[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>>
+    %127 = builtin.unrealized_conversion_cast %subview_2 : memref<3x3xi8, strided<[3, 1], offset: 9>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %subview_3 = memref.subview %46[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>>
+    %128 = builtin.unrealized_conversion_cast %subview_3 : memref<3x3xi8, strided<[3, 1], offset: 9>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %subview_4 = memref.subview %68[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>>
+    %129 = builtin.unrealized_conversion_cast %subview_4 : memref<3x3xi8, strided<[3, 1], offset: 9>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %130 = llvm.mlir.constant(3 : index) : i64
+    %131 = llvm.mlir.constant(3 : index) : i64
+    %132 = llvm.mlir.constant(1 : index) : i64
+    %133 = llvm.mlir.constant(9 : index) : i64
+    %134 = llvm.mlir.zero : !llvm.ptr
+    %135 = llvm.getelementptr %134[%133] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %136 = llvm.ptrtoint %135 : !llvm.ptr to i64
+    %137 = llvm.call @malloc(%136) : (i64) -> !llvm.ptr
+    %138 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %139 = llvm.insertvalue %137, %138[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %140 = llvm.insertvalue %137, %139[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %141 = llvm.mlir.constant(0 : index) : i64
+    %142 = llvm.insertvalue %141, %140[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %143 = llvm.insertvalue %130, %142[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %144 = llvm.insertvalue %131, %143[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %145 = llvm.insertvalue %131, %144[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %146 = llvm.insertvalue %132, %145[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %147 = builtin.unrealized_conversion_cast %146 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x3xi32>
+    %148 = llvm.mlir.constant(0 : i32) : i32
+    linalg.fill ins(%148 : i32) outs(%147 : memref<3x3xi32>)
+    %149 = llvm.extractvalue %127[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %150 = llvm.ptrtoint %149 : !llvm.ptr to i64
+    %151 = llvm.mlir.constant(9 : index) : i64
+    %152 = llvm.add %150, %151 : i64
+    %153 = llvm.extractvalue %128[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %154 = llvm.ptrtoint %153 : !llvm.ptr to i64
+    %155 = llvm.mlir.constant(9 : index) : i64
+    %156 = llvm.add %154, %155 : i64
+    %157 = llvm.extractvalue %129[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %158 = llvm.ptrtoint %157 : !llvm.ptr to i64
+    %159 = llvm.mlir.constant(9 : index) : i64
+    %160 = llvm.add %158, %159 : i64
+    %161 = llvm.extractvalue %146[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %162 = llvm.ptrtoint %161 : !llvm.ptr to i64
+    %163 = llvm.mlir.constant(4575657221408489476 : i64) : i64
+    %164 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%163, %164) : (i64, i64) -> ()
+    %165 = llvm.mlir.constant(3 : i64) : i64
+    %166 = llvm.mlir.constant(2 : i64) : i64
+    %167 = llvm.mlir.constant(4575657221408423939 : i64) : i64
+    "gemmini.intr.config_st"(%166, %167) : (i64, i64) -> ()
+    %168 = llvm.mlir.constant(3 : i64) : i64
+    %169 = llvm.mlir.constant(4575657221409472769 : i64) : i64
+    "gemmini.intr.config_ld"(%169, %168) : (i64, i64) -> ()
+    %170 = llvm.mlir.constant(3 : i64) : i64
+    %171 = llvm.mlir.constant(4575657221409472777 : i64) : i64
+    "gemmini.intr.config_ld"(%171, %170) : (i64, i64) -> ()
+    %172 = llvm.mlir.constant(12 : i64) : i64
+    %173 = llvm.mlir.constant(4575657221409472785 : i64) : i64
+    "gemmini.intr.config_ld"(%173, %172) : (i64, i64) -> ()
+    %174 = llvm.mlir.constant(0 : i64) : i64
+    %175 = llvm.mlir.constant(0 : i64) : i64
+    %176 = llvm.mlir.constant(0 : i64) : i64
+    %177 = llvm.mlir.constant(0 : i64) : i64
+    %178 = llvm.mlir.constant(55835426829 : i64) : i64
+    %179 = llvm.mlir.constant(4295032833 : i64) : i64
+    "gemmini.intr.loop_ws_config_bounds"(%178, %179) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_ab"(%152, %156) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_dc"(%162, %160) : (i64, i64) -> ()
+    %180 = llvm.mlir.constant(3 : i64) : i64
+    %181 = llvm.mlir.constant(3 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_ab"(%180, %181) : (i64, i64) -> ()
+    %182 = llvm.mlir.constant(3 : i64) : i64
+    %183 = llvm.mlir.constant(3 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_dc"(%182, %183) : (i64, i64) -> ()
+    %184 = llvm.mlir.constant(1 : i64) : i64
+    %185 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_ws"(%184, %185) : (i64, i64) -> ()
+    %186 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%186, %186) : (i64, i64) -> ()
+    %187 = llvm.extractvalue %146[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%187) : (!llvm.ptr) -> ()
+    %subview_5 = memref.subview %24[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>>
+    %188 = builtin.unrealized_conversion_cast %subview_5 : memref<3x3xi8, strided<[3, 1], offset: 18>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %subview_6 = memref.subview %46[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>>
+    %189 = builtin.unrealized_conversion_cast %subview_6 : memref<3x3xi8, strided<[3, 1], offset: 18>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %subview_7 = memref.subview %68[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>>
+    %190 = builtin.unrealized_conversion_cast %subview_7 : memref<3x3xi8, strided<[3, 1], offset: 18>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %191 = llvm.mlir.constant(3 : index) : i64
+    %192 = llvm.mlir.constant(3 : index) : i64
+    %193 = llvm.mlir.constant(1 : index) : i64
+    %194 = llvm.mlir.constant(9 : index) : i64
+    %195 = llvm.mlir.zero : !llvm.ptr
+    %196 = llvm.getelementptr %195[%194] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %197 = llvm.ptrtoint %196 : !llvm.ptr to i64
+    %198 = llvm.call @malloc(%197) : (i64) -> !llvm.ptr
+    %199 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %200 = llvm.insertvalue %198, %199[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %201 = llvm.insertvalue %198, %200[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %202 = llvm.mlir.constant(0 : index) : i64
+    %203 = llvm.insertvalue %202, %201[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %204 = llvm.insertvalue %191, %203[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %205 = llvm.insertvalue %192, %204[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %206 = llvm.insertvalue %192, %205[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %207 = llvm.insertvalue %193, %206[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %208 = builtin.unrealized_conversion_cast %207 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x3xi32>
+    %209 = llvm.mlir.constant(0 : i32) : i32
+    linalg.fill ins(%209 : i32) outs(%208 : memref<3x3xi32>)
+    %210 = llvm.extractvalue %188[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %211 = llvm.ptrtoint %210 : !llvm.ptr to i64
+    %212 = llvm.mlir.constant(18 : index) : i64
+    %213 = llvm.add %211, %212 : i64
+    %214 = llvm.extractvalue %189[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %215 = llvm.ptrtoint %214 : !llvm.ptr to i64
+    %216 = llvm.mlir.constant(18 : index) : i64
+    %217 = llvm.add %215, %216 : i64
+    %218 = llvm.extractvalue %190[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %219 = llvm.ptrtoint %218 : !llvm.ptr to i64
+    %220 = llvm.mlir.constant(18 : index) : i64
+    %221 = llvm.add %219, %220 : i64
+    %222 = llvm.extractvalue %207[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %223 = llvm.ptrtoint %222 : !llvm.ptr to i64
+    %224 = llvm.mlir.constant(4575657221408489476 : i64) : i64
+    %225 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%224, %225) : (i64, i64) -> ()
+    %226 = llvm.mlir.constant(3 : i64) : i64
+    %227 = llvm.mlir.constant(2 : i64) : i64
+    %228 = llvm.mlir.constant(4575657221408423939 : i64) : i64
+    "gemmini.intr.config_st"(%227, %228) : (i64, i64) -> ()
+    %229 = llvm.mlir.constant(3 : i64) : i64
+    %230 = llvm.mlir.constant(4575657221409472769 : i64) : i64
+    "gemmini.intr.config_ld"(%230, %229) : (i64, i64) -> ()
+    %231 = llvm.mlir.constant(3 : i64) : i64
+    %232 = llvm.mlir.constant(4575657221409472777 : i64) : i64
+    "gemmini.intr.config_ld"(%232, %231) : (i64, i64) -> ()
+    %233 = llvm.mlir.constant(12 : i64) : i64
+    %234 = llvm.mlir.constant(4575657221409472785 : i64) : i64
+    "gemmini.intr.config_ld"(%234, %233) : (i64, i64) -> ()
+    %235 = llvm.mlir.constant(0 : i64) : i64
+    %236 = llvm.mlir.constant(0 : i64) : i64
+    %237 = llvm.mlir.constant(0 : i64) : i64
+    %238 = llvm.mlir.constant(0 : i64) : i64
+    %239 = llvm.mlir.constant(55835426829 : i64) : i64
+    %240 = llvm.mlir.constant(4295032833 : i64) : i64
+    "gemmini.intr.loop_ws_config_bounds"(%239, %240) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_ab"(%213, %217) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_dc"(%223, %221) : (i64, i64) -> ()
+    %241 = llvm.mlir.constant(3 : i64) : i64
+    %242 = llvm.mlir.constant(3 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_ab"(%241, %242) : (i64, i64) -> ()
+    %243 = llvm.mlir.constant(3 : i64) : i64
+    %244 = llvm.mlir.constant(3 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_dc"(%243, %244) : (i64, i64) -> ()
+    %245 = llvm.mlir.constant(1 : i64) : i64
+    %246 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_ws"(%245, %246) : (i64, i64) -> ()
+    %247 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%247, %247) : (i64, i64) -> ()
+    %248 = llvm.extractvalue %207[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%248) : (!llvm.ptr) -> ()
+    %249 = llvm.mlir.addressof @frmt_spec : !llvm.ptr
+    %250 = llvm.mlir.constant(0 : index) : i64
+    %251 = llvm.getelementptr %249[%250, %250] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8>
+    %252 = llvm.mlir.addressof @nl : !llvm.ptr
+    %253 = llvm.mlir.constant(0 : index) : i64
+    %254 = llvm.getelementptr %252[%253, %253] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8>
+    %255 = llvm.mlir.constant(0 : index) : i64
+    %256 = llvm.mlir.constant(3 : index) : i64
+    %257 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb1(%255 : i64)
+  ^bb1(%258: i64):  // 2 preds: ^bb0, ^bb8
+    %259 = llvm.icmp "slt" %258, %256 : i64
+    llvm.cond_br %259, ^bb2, ^bb9
+  ^bb2:  // pred: ^bb1
+    %260 = llvm.mlir.constant(0 : index) : i64
+    %261 = llvm.mlir.constant(3 : index) : i64
+    %262 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb3(%260 : i64)
+  ^bb3(%263: i64):  // 2 preds: ^bb2, ^bb7
+    %264 = llvm.icmp "slt" %263, %261 : i64
+    llvm.cond_br %264, ^bb4, ^bb8
+  ^bb4:  // pred: ^bb3
+    %265 = llvm.mlir.constant(0 : index) : i64
+    %266 = llvm.mlir.constant(3 : index) : i64
+    %267 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb5(%265 : i64)
+  ^bb5(%268: i64):  // 2 preds: ^bb4, ^bb6
+    %269 = llvm.icmp "slt" %268, %266 : i64
+    llvm.cond_br %269, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    %270 = llvm.extractvalue %67[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    %271 = llvm.mlir.constant(9 : index) : i64
+    %272 = llvm.mul %258, %271 : i64
+    %273 = llvm.mlir.constant(3 : index) : i64
+    %274 = llvm.mul %263, %273 : i64
+    %275 = llvm.add %272, %274 : i64
+    %276 = llvm.add %275, %268 : i64
+    %277 = llvm.getelementptr %270[%276] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %278 = llvm.load %277 : !llvm.ptr -> i8
+    %279 = llvm.sext %278 : i8 to i32
+    %280 = llvm.call @printf(%251, %279) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+    %281 = llvm.add %268, %267 : i64
+    llvm.br ^bb5(%281 : i64)
+  ^bb7:  // pred: ^bb5
+    %282 = llvm.call @printf(%254) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %283 = llvm.add %263, %262 : i64
+    llvm.br ^bb3(%283 : i64)
+  ^bb8:  // pred: ^bb3
+    %284 = llvm.call @printf(%254) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %285 = llvm.add %258, %257 : i64
+    llvm.br ^bb1(%285 : i64)
+  ^bb9:  // pred: ^bb1
+    %286 = llvm.extractvalue %67[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+    llvm.call @free(%286) : (!llvm.ptr) -> ()
+    llvm.return %0 : i8
+  }
+}
+
+
diff --git a/experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir b/experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir
new file mode 100644
index 0000000..c372bc5
--- /dev/null
+++ b/experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir
@@ -0,0 +1,594 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {
+  memref.global "private" @input : memref<2x2x5x5xf32> = dense<[[[[1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]]], [[[1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]]]]>
+  memref.global "private" @weight : memref<2x2x3x3xf32> = dense<[[[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]], [[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]]]>
+  func.func @main() -> i8 {
+    %c0_i8 = arith.constant 0 : i8
+    %0 = memref.get_global @input : memref<2x2x5x5xf32>
+    %1 = memref.get_global @weight : memref<2x2x3x3xf32>
+    %alloc = memref.alloc() : memref<2x2x3x3xf32>
+    %alloc_0 = memref.alloc() : memref<2x5x5x2xf32>
+    %alloc_1 = memref.alloc() : memref<18x2xf32>
+    %alloc_2 = memref.alloc() : memref<2xi32>
+    %alloc_3 = memref.alloc() : memref<18x2xf32>
+    %c3_i64 = arith.constant 3 : i64
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c2_4 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    scf.for %arg0 = %c0 to %c2_4 step %c1 {
+      %c0_12 = arith.constant 0 : index
+      %c2_13 = arith.constant 2 : index
+      %c1_14 = arith.constant 1 : index
+      scf.for %arg1 = %c0_12 to %c2_13 step %c1_14 {
+        %c0_15 = arith.constant 0 : index
+        %c5 = arith.constant 5 : index
+        %c1_16 = arith.constant 1 : index
+        scf.for %arg2 = %c0_15 to %c5 step %c1_16 {
+          %c0_17 = arith.constant 0 : index
+          %c5_18 = arith.constant 5 : index
+          %c1_19 = arith.constant 1 : index
+          scf.for %arg3 = %c0_17 to %c5_18 step %c1_19 {
+            %2 = memref.load %0[%arg0, %arg1, %arg2, %arg3] : memref<2x2x5x5xf32>
+            memref.store %2, %alloc_0[%arg0, %arg2, %arg3, %arg1] : memref<2x5x5x2xf32>
+          }
+        }
+      }
+    }
+    %c0_5 = arith.constant 0 : index
+    %c2_6 = arith.constant 2 : index
+    %c1_7 = arith.constant 1 : index
+    scf.for %arg0 = %c0_5 to %c2_6 step %c1_7 {
+      %c0_12 = arith.constant 0 : index
+      %c2_13 = arith.constant 2 : index
+      %c1_14 = arith.constant 1 : index
+      scf.for %arg1 = %c0_12 to %c2_13 step %c1_14 {
+        %c0_15 = arith.constant 0 : index
+        %c3_16 = arith.constant 3 : index
+        %c1_17 = arith.constant 1 : index
+        scf.for %arg2 = %c0_15 to %c3_16 step %c1_17 {
+          %c0_18 = arith.constant 0 : index
+          %c3_19 = arith.constant 3 : index
+          %c1_20 = arith.constant 1 : index
+          scf.for %arg3 = %c0_18 to %c3_19 step %c1_20 {
+            %2 = arith.muli %arg2, %c3 : index
+            %3 = arith.muli %2, %c2 : index
+            %4 = arith.muli %arg3, %c2 : index
+            %5 = arith.addi %3, %4 : index
+            %6 = arith.addi %5, %arg1 : index
+            %7 = memref.load %1[%arg0, %arg1, %arg2, %arg3] : memref<2x2x3x3xf32>
+            memref.store %7, %alloc_1[%6, %arg0] : memref<18x2xf32>
+          }
+        }
+      }
+    }
+    %c3_i64_8 = arith.constant 3 : i64
+    gemmini.tile_conv %alloc_0 %alloc_1 %alloc_2 %alloc_3 %c3_i64 %c3_i64 %c3_i64_8 : memref<2x5x5x2xf32> memref<18x2xf32> memref<2xi32> memref<18x2xf32> i64 i64 i64
+    %c0_9 = arith.constant 0 : index
+    %c2_10 = arith.constant 2 : index
+    %c1_11 = arith.constant 1 : index
+    scf.for %arg0 = %c0_9 to %c2_10 step %c1_11 {
+      %c0_12 = arith.constant 0 : index
+      %c2_13 = arith.constant 2 : index
+      %c1_14 = arith.constant 1 : index
+      scf.for %arg1 = %c0_12 to %c2_13 step %c1_14 {
+        %c0_15 = arith.constant 0 : index
+        %c3_16 = arith.constant 3 : index
+        %c1_17 = arith.constant 1 : index
+        scf.for %arg2 = %c0_15 to %c3_16 step %c1_17 {
+          %c0_18 = arith.constant 0 : index
+          %c3_19 = arith.constant 3 : index
+          %c1_20 = arith.constant 1 : index
+          scf.for %arg3 = %c0_18 to %c3_19 step %c1_20 {
+            %c3_21 = arith.constant 3 : index
+            %2 = arith.muli %arg0, %c3_21 : index
+            %3 = arith.muli %2, %c3_21 : index
+            %4 = arith.muli %arg2, %c3_21 : index
+            %5 = arith.addi %3, %4 : index
+            %6 = arith.addi %5, %arg3 : index
+            %7 = memref.load %alloc_3[%6, %arg1] : memref<18x2xf32>
+            memref.store %7, %alloc[%arg0, %arg1, %arg2, %arg3] : memref<2x2x3x3xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc_0 : memref<2x5x5x2xf32>
+    memref.dealloc %alloc_1 : memref<18x2xf32>
+    memref.dealloc %alloc_3 : memref<18x2xf32>
+    memref.dealloc %alloc_2 : memref<2xi32>
+    gemmini.print %alloc : memref<2x2x3x3xf32>
+    return %c0_i8 : i8
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.mlir.global internal constant @nl("\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @frmt_spec("%f \00") {addr_space = 0 : i32}
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.mlir.global private @input(dense<[[[[1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]]], [[[1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]]]]> : tensor<2x2x5x5xf32>) {addr_space = 0 : i32} : !llvm.array<2 x array<2 x array<5 x array<5 x f32>>>>
+  llvm.mlir.global private @weight(dense<[[[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]], [[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]]]> : tensor<2x2x3x3xf32>) {addr_space = 0 : i32} : !llvm.array<2 x array<2 x array<3 x array<3 x f32>>>>
+  llvm.func @main() -> i8 {
+    %0 = llvm.mlir.constant(0 : i8) : i8
+    %1 = llvm.mlir.constant(2 : index) : i64
+    %2 = llvm.mlir.constant(2 : index) : i64
+    %3 = llvm.mlir.constant(5 : index) : i64
+    %4 = llvm.mlir.constant(5 : index) : i64
+    %5 = llvm.mlir.constant(1 : index) : i64
+    %6 = llvm.mlir.constant(25 : index) : i64
+    %7 = llvm.mlir.constant(50 : index) : i64
+    %8 = llvm.mlir.constant(100 : index) : i64
+    %9 = llvm.mlir.zero : !llvm.ptr
+    %10 = llvm.getelementptr %9[%8] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %11 = llvm.ptrtoint %10 : !llvm.ptr to i64
+    %12 = llvm.mlir.addressof @input : !llvm.ptr
+    %13 = llvm.getelementptr %12[0, 0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<2 x array<5 x array<5 x f32>>>>
+    %14 = llvm.mlir.constant(3735928559 : index) : i64
+    %15 = llvm.inttoptr %14 : i64 to !llvm.ptr
+    %16 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %17 = llvm.insertvalue %15, %16[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %18 = llvm.insertvalue %13, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %19 = llvm.mlir.constant(0 : index) : i64
+    %20 = llvm.insertvalue %19, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %21 = llvm.insertvalue %1, %20[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %22 = llvm.insertvalue %2, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %23 = llvm.insertvalue %3, %22[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %24 = llvm.insertvalue %4, %23[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %25 = llvm.insertvalue %7, %24[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %26 = llvm.insertvalue %6, %25[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %27 = llvm.insertvalue %4, %26[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %28 = llvm.insertvalue %5, %27[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %29 = llvm.mlir.constant(2 : index) : i64
+    %30 = llvm.mlir.constant(2 : index) : i64
+    %31 = llvm.mlir.constant(3 : index) : i64
+    %32 = llvm.mlir.constant(3 : index) : i64
+    %33 = llvm.mlir.constant(1 : index) : i64
+    %34 = llvm.mlir.constant(9 : index) : i64
+    %35 = llvm.mlir.constant(18 : index) : i64
+    %36 = llvm.mlir.constant(36 : index) : i64
+    %37 = llvm.mlir.zero : !llvm.ptr
+    %38 = llvm.getelementptr %37[%36] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %39 = llvm.ptrtoint %38 : !llvm.ptr to i64
+    %40 = llvm.mlir.addressof @weight : !llvm.ptr
+    %41 = llvm.getelementptr %40[0, 0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<2 x array<3 x array<3 x f32>>>>
+    %42 = llvm.mlir.constant(3735928559 : index) : i64
+    %43 = llvm.inttoptr %42 : i64 to !llvm.ptr
+    %44 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %45 = llvm.insertvalue %43, %44[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %46 = llvm.insertvalue %41, %45[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %47 = llvm.mlir.constant(0 : index) : i64
+    %48 = llvm.insertvalue %47, %46[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %49 = llvm.insertvalue %29, %48[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %50 = llvm.insertvalue %30, %49[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %51 = llvm.insertvalue %31, %50[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %52 = llvm.insertvalue %32, %51[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %53 = llvm.insertvalue %35, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %54 = llvm.insertvalue %34, %53[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %55 = llvm.insertvalue %32, %54[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %56 = llvm.insertvalue %33, %55[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %57 = llvm.mlir.constant(2 : index) : i64
+    %58 = llvm.mlir.constant(2 : index) : i64
+    %59 = llvm.mlir.constant(3 : index) : i64
+    %60 = llvm.mlir.constant(3 : index) : i64
+    %61 = llvm.mlir.constant(1 : index) : i64
+    %62 = llvm.mlir.constant(9 : index) : i64
+    %63 = llvm.mlir.constant(18 : index) : i64
+    %64 = llvm.mlir.constant(36 : index) : i64
+    %65 = llvm.mlir.zero : !llvm.ptr
+    %66 = llvm.getelementptr %65[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %67 = llvm.ptrtoint %66 : !llvm.ptr to i64
+    %68 = llvm.call @malloc(%67) : (i64) -> !llvm.ptr
+    %69 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %70 = llvm.insertvalue %68, %69[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %71 = llvm.insertvalue %68, %70[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %72 = llvm.mlir.constant(0 : index) : i64
+    %73 = llvm.insertvalue %72, %71[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %74 = llvm.insertvalue %57, %73[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %75 = llvm.insertvalue %58, %74[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %76 = llvm.insertvalue %59, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %77 = llvm.insertvalue %60, %76[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %78 = llvm.insertvalue %63, %77[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %79 = llvm.insertvalue %62, %78[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %80 = llvm.insertvalue %60, %79[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %81 = llvm.insertvalue %61, %80[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %82 = llvm.mlir.constant(2 : index) : i64
+    %83 = llvm.mlir.constant(5 : index) : i64
+    %84 = llvm.mlir.constant(5 : index) : i64
+    %85 = llvm.mlir.constant(2 : index) : i64
+    %86 = llvm.mlir.constant(1 : index) : i64
+    %87 = llvm.mlir.constant(10 : index) : i64
+    %88 = llvm.mlir.constant(50 : index) : i64
+    %89 = llvm.mlir.constant(100 : index) : i64
+    %90 = llvm.mlir.zero : !llvm.ptr
+    %91 = llvm.getelementptr %90[%89] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %92 = llvm.ptrtoint %91 : !llvm.ptr to i64
+    %93 = llvm.call @malloc(%92) : (i64) -> !llvm.ptr
+    %94 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %95 = llvm.insertvalue %93, %94[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %96 = llvm.insertvalue %93, %95[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %97 = llvm.mlir.constant(0 : index) : i64
+    %98 = llvm.insertvalue %97, %96[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %99 = llvm.insertvalue %82, %98[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %100 = llvm.insertvalue %83, %99[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %101 = llvm.insertvalue %84, %100[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %102 = llvm.insertvalue %85, %101[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %103 = llvm.insertvalue %88, %102[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %104 = llvm.insertvalue %87, %103[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %105 = llvm.insertvalue %85, %104[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %106 = llvm.insertvalue %86, %105[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %107 = llvm.mlir.constant(18 : index) : i64
+    %108 = llvm.mlir.constant(2 : index) : i64
+    %109 = llvm.mlir.constant(1 : index) : i64
+    %110 = llvm.mlir.constant(36 : index) : i64
+    %111 = llvm.mlir.zero : !llvm.ptr
+    %112 = llvm.getelementptr %111[%110] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %113 = llvm.ptrtoint %112 : !llvm.ptr to i64
+    %114 = llvm.call @malloc(%113) : (i64) -> !llvm.ptr
+    %115 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %116 = llvm.insertvalue %114, %115[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %117 = llvm.insertvalue %114, %116[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %118 = llvm.mlir.constant(0 : index) : i64
+    %119 = llvm.insertvalue %118, %117[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %120 = llvm.insertvalue %107, %119[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %121 = llvm.insertvalue %108, %120[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %122 = llvm.insertvalue %108, %121[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %123 = llvm.insertvalue %109, %122[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %124 = llvm.mlir.constant(2 : index) : i64
+    %125 = llvm.mlir.constant(1 : index) : i64
+    %126 = llvm.mlir.zero : !llvm.ptr
+    %127 = llvm.getelementptr %126[%124] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %128 = llvm.ptrtoint %127 : !llvm.ptr to i64
+    %129 = llvm.call @malloc(%128) : (i64) -> !llvm.ptr
+    %130 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %131 = llvm.insertvalue %129, %130[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %132 = llvm.insertvalue %129, %131[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %133 = llvm.mlir.constant(0 : index) : i64
+    %134 = llvm.insertvalue %133, %132[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %135 = llvm.insertvalue %124, %134[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %136 = llvm.insertvalue %125, %135[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %137 = llvm.mlir.constant(18 : index) : i64
+    %138 = llvm.mlir.constant(2 : index) : i64
+    %139 = llvm.mlir.constant(1 : index) : i64
+    %140 = llvm.mlir.constant(36 : index) : i64
+    %141 = llvm.mlir.zero : !llvm.ptr
+    %142 = llvm.getelementptr %141[%140] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %143 = llvm.ptrtoint %142 : !llvm.ptr to i64
+    %144 = llvm.call @malloc(%143) : (i64) -> !llvm.ptr
+    %145 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %146 = llvm.insertvalue %144, %145[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %147 = llvm.insertvalue %144, %146[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %148 = llvm.mlir.constant(0 : index) : i64
+    %149 = llvm.insertvalue %148, %147[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %150 = llvm.insertvalue %137, %149[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %151 = llvm.insertvalue %138, %150[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %152 = llvm.insertvalue %138, %151[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %153 = llvm.insertvalue %139, %152[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %154 = llvm.mlir.constant(3 : i64) : i64
+    %155 = llvm.mlir.constant(3 : index) : i64
+    %156 = llvm.mlir.constant(2 : index) : i64
+    %157 = llvm.mlir.constant(0 : index) : i64
+    %158 = llvm.mlir.constant(2 : index) : i64
+    %159 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb1(%157 : i64)
+  ^bb1(%160: i64):  // 2 preds: ^bb0, ^bb11
+    %161 = llvm.icmp "slt" %160, %158 : i64
+    llvm.cond_br %161, ^bb2, ^bb12
+  ^bb2:  // pred: ^bb1
+    %162 = llvm.mlir.constant(0 : index) : i64
+    %163 = llvm.mlir.constant(2 : index) : i64
+    %164 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb3(%162 : i64)
+  ^bb3(%165: i64):  // 2 preds: ^bb2, ^bb10
+    %166 = llvm.icmp "slt" %165, %163 : i64
+    llvm.cond_br %166, ^bb4, ^bb11
+  ^bb4:  // pred: ^bb3
+    %167 = llvm.mlir.constant(0 : index) : i64
+    %168 = llvm.mlir.constant(5 : index) : i64
+    %169 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb5(%167 : i64)
+  ^bb5(%170: i64):  // 2 preds: ^bb4, ^bb9
+    %171 = llvm.icmp "slt" %170, %168 : i64
+    llvm.cond_br %171, ^bb6, ^bb10
+  ^bb6:  // pred: ^bb5
+    %172 = llvm.mlir.constant(0 : index) : i64
+    %173 = llvm.mlir.constant(5 : index) : i64
+    %174 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb7(%172 : i64)
+  ^bb7(%175: i64):  // 2 preds: ^bb6, ^bb8
+    %176 = llvm.icmp "slt" %175, %173 : i64
+    llvm.cond_br %176, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %177 = llvm.extractvalue %28[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %178 = llvm.mlir.constant(50 : index) : i64
+    %179 = llvm.mul %160, %178 : i64
+    %180 = llvm.mlir.constant(25 : index) : i64
+    %181 = llvm.mul %165, %180 : i64
+    %182 = llvm.add %179, %181 : i64
+    %183 = llvm.mlir.constant(5 : index) : i64
+    %184 = llvm.mul %170, %183 : i64
+    %185 = llvm.add %182, %184 : i64
+    %186 = llvm.add %185, %175 : i64
+    %187 = llvm.getelementptr %177[%186] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %188 = llvm.load %187 : !llvm.ptr -> f32
+    %189 = llvm.extractvalue %106[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %190 = llvm.mlir.constant(50 : index) : i64
+    %191 = llvm.mul %160, %190 : i64
+    %192 = llvm.mlir.constant(10 : index) : i64
+    %193 = llvm.mul %170, %192 : i64
+    %194 = llvm.add %191, %193 : i64
+    %195 = llvm.mlir.constant(2 : index) : i64
+    %196 = llvm.mul %175, %195 : i64
+    %197 = llvm.add %194, %196 : i64
+    %198 = llvm.add %197, %165 : i64
+    %199 = llvm.getelementptr %189[%198] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %188, %199 : f32, !llvm.ptr
+    %200 = llvm.add %175, %174 : i64
+    llvm.br ^bb7(%200 : i64)
+  ^bb9:  // pred: ^bb7
+    %201 = llvm.add %170, %169 : i64
+    llvm.br ^bb5(%201 : i64)
+  ^bb10:  // pred: ^bb5
+    %202 = llvm.add %165, %164 : i64
+    llvm.br ^bb3(%202 : i64)
+  ^bb11:  // pred: ^bb3
+    %203 = llvm.add %160, %159 : i64
+    llvm.br ^bb1(%203 : i64)
+  ^bb12:  // pred: ^bb1
+    %204 = llvm.mlir.constant(0 : index) : i64
+    %205 = llvm.mlir.constant(2 : index) : i64
+    %206 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%204 : i64)
+  ^bb13(%207: i64):  // 2 preds: ^bb12, ^bb23
+    %208 = llvm.icmp "slt" %207, %205 : i64
+    llvm.cond_br %208, ^bb14, ^bb24
+  ^bb14:  // pred: ^bb13
+    %209 = llvm.mlir.constant(0 : index) : i64
+    %210 = llvm.mlir.constant(2 : index) : i64
+    %211 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb15(%209 : i64)
+  ^bb15(%212: i64):  // 2 preds: ^bb14, ^bb22
+    %213 = llvm.icmp "slt" %212, %210 : i64
+    llvm.cond_br %213, ^bb16, ^bb23
+  ^bb16:  // pred: ^bb15
+    %214 = llvm.mlir.constant(0 : index) : i64
+    %215 = llvm.mlir.constant(3 : index) : i64
+    %216 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb17(%214 : i64)
+  ^bb17(%217: i64):  // 2 preds: ^bb16, ^bb21
+    %218 = llvm.icmp "slt" %217, %215 : i64
+    llvm.cond_br %218, ^bb18, ^bb22
+  ^bb18:  // pred: ^bb17
+    %219 = llvm.mlir.constant(0 : index) : i64
+    %220 = llvm.mlir.constant(3 : index) : i64
+    %221 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb19(%219 : i64)
+  ^bb19(%222: i64):  // 2 preds: ^bb18, ^bb20
+    %223 = llvm.icmp "slt" %222, %220 : i64
+    llvm.cond_br %223, ^bb20, ^bb21
+  ^bb20:  // pred: ^bb19
+    %224 = llvm.mul %217, %155 : i64
+    %225 = llvm.mul %224, %156 : i64
+    %226 = llvm.mul %222, %156 : i64
+    %227 = llvm.add %225, %226 : i64
+    %228 = llvm.add %227, %212 : i64
+    %229 = llvm.extractvalue %56[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %230 = llvm.mlir.constant(18 : index) : i64
+    %231 = llvm.mul %207, %230 : i64
+    %232 = llvm.mlir.constant(9 : index) : i64
+    %233 = llvm.mul %212, %232 : i64
+    %234 = llvm.add %231, %233 : i64
+    %235 = llvm.mlir.constant(3 : index) : i64
+    %236 = llvm.mul %217, %235 : i64
+    %237 = llvm.add %234, %236 : i64
+    %238 = llvm.add %237, %222 : i64
+    %239 = llvm.getelementptr %229[%238] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %240 = llvm.load %239 : !llvm.ptr -> f32
+    %241 = llvm.extractvalue %123[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %242 = llvm.mlir.constant(2 : index) : i64
+    %243 = llvm.mul %228, %242 : i64
+    %244 = llvm.add %243, %207 : i64
+    %245 = llvm.getelementptr %241[%244] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %240, %245 : f32, !llvm.ptr
+    %246 = llvm.add %222, %221 : i64
+    llvm.br ^bb19(%246 : i64)
+  ^bb21:  // pred: ^bb19
+    %247 = llvm.add %217, %216 : i64
+    llvm.br ^bb17(%247 : i64)
+  ^bb22:  // pred: ^bb17
+    %248 = llvm.add %212, %211 : i64
+    llvm.br ^bb15(%248 : i64)
+  ^bb23:  // pred: ^bb15
+    %249 = llvm.add %207, %206 : i64
+    llvm.br ^bb13(%249 : i64)
+  ^bb24:  // pred: ^bb13
+    %250 = llvm.mlir.constant(3 : i64) : i64
+    %251 = llvm.extractvalue %106[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %252 = llvm.ptrtoint %251 : !llvm.ptr to i64
+    %253 = llvm.extractvalue %153[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %254 = llvm.ptrtoint %253 : !llvm.ptr to i64
+    %255 = llvm.extractvalue %136[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %256 = llvm.ptrtoint %255 : !llvm.ptr to i64
+    %257 = llvm.extractvalue %123[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %258 = llvm.ptrtoint %257 : !llvm.ptr to i64
+    %259 = llvm.mlir.constant(2 : i64) : i64
+    %260 = llvm.mlir.constant(2 : i64) : i64
+    %261 = llvm.mlir.constant(4575657221408423938 : i64) : i64
+    "gemmini.intr.config_st"(%260, %261) : (i64, i64) -> ()
+    %262 = llvm.mlir.constant(65540 : i64) : i64
+    %263 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%262, %263) : (i64, i64) -> ()
+    %264 = llvm.mlir.constant(0 : i64) : i64
+    %265 = llvm.mlir.constant(0 : i64) : i64
+    %266 = llvm.mlir.constant(0 : i64) : i64
+    %267 = llvm.mlir.constant(0 : i64) : i64
+    %268 = llvm.mlir.constant(562958543683586 : i64) : i64
+    %269 = llvm.mlir.constant(4295163907 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%268, %269) : (i64, i64) -> ()
+    %270 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %271 = llvm.mlir.constant(562962838519810 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%270, %271) : (i64, i64) -> ()
+    %272 = llvm.mlir.constant(844437815164928 : i64) : i64
+    %273 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%272, %273) : (i64, i64) -> ()
+    %274 = llvm.mlir.constant(844424930131968 : i64) : i64
+    %275 = llvm.mlir.constant(65539 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%274, %275) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%258, %254) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%256, %252) : (i64, i64) -> ()
+    %276 = llvm.mlir.constant(768 : i64) : i64
+    %277 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%276, %277) : (i64, i64) -> ()
+    %278 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%278, %278) : (i64, i64) -> ()
+    %279 = llvm.mlir.constant(0 : index) : i64
+    %280 = llvm.mlir.constant(2 : index) : i64
+    %281 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb25(%279 : i64)
+  ^bb25(%282: i64):  // 2 preds: ^bb24, ^bb35
+    %283 = llvm.icmp "slt" %282, %280 : i64
+    llvm.cond_br %283, ^bb26, ^bb36
+  ^bb26:  // pred: ^bb25
+    %284 = llvm.mlir.constant(0 : index) : i64
+    %285 = llvm.mlir.constant(2 : index) : i64
+    %286 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb27(%284 : i64)
+  ^bb27(%287: i64):  // 2 preds: ^bb26, ^bb34
+    %288 = llvm.icmp "slt" %287, %285 : i64
+    llvm.cond_br %288, ^bb28, ^bb35
+  ^bb28:  // pred: ^bb27
+    %289 = llvm.mlir.constant(0 : index) : i64
+    %290 = llvm.mlir.constant(3 : index) : i64
+    %291 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb29(%289 : i64)
+  ^bb29(%292: i64):  // 2 preds: ^bb28, ^bb33
+    %293 = llvm.icmp "slt" %292, %290 : i64
+    llvm.cond_br %293, ^bb30, ^bb34
+  ^bb30:  // pred: ^bb29
+    %294 = llvm.mlir.constant(0 : index) : i64
+    %295 = llvm.mlir.constant(3 : index) : i64
+    %296 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb31(%294 : i64)
+  ^bb31(%297: i64):  // 2 preds: ^bb30, ^bb32
+    %298 = llvm.icmp "slt" %297, %295 : i64
+    llvm.cond_br %298, ^bb32, ^bb33
+  ^bb32:  // pred: ^bb31
+    %299 = llvm.mlir.constant(3 : index) : i64
+    %300 = llvm.mul %282, %299 : i64
+    %301 = llvm.mul %300, %299 : i64
+    %302 = llvm.mul %292, %299 : i64
+    %303 = llvm.add %301, %302 : i64
+    %304 = llvm.add %303, %297 : i64
+    %305 = llvm.extractvalue %153[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %306 = llvm.mlir.constant(2 : index) : i64
+    %307 = llvm.mul %304, %306 : i64
+    %308 = llvm.add %307, %287 : i64
+    %309 = llvm.getelementptr %305[%308] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %310 = llvm.load %309 : !llvm.ptr -> f32
+    %311 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %312 = llvm.mlir.constant(18 : index) : i64
+    %313 = llvm.mul %282, %312 : i64
+    %314 = llvm.mlir.constant(9 : index) : i64
+    %315 = llvm.mul %287, %314 : i64
+    %316 = llvm.add %313, %315 : i64
+    %317 = llvm.mlir.constant(3 : index) : i64
+    %318 = llvm.mul %292, %317 : i64
+    %319 = llvm.add %316, %318 : i64
+    %320 = llvm.add %319, %297 : i64
+    %321 = llvm.getelementptr %311[%320] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %310, %321 : f32, !llvm.ptr
+    %322 = llvm.add %297, %296 : i64
+    llvm.br ^bb31(%322 : i64)
+  ^bb33:  // pred: ^bb31
+    %323 = llvm.add %292, %291 : i64
+    llvm.br ^bb29(%323 : i64)
+  ^bb34:  // pred: ^bb29
+    %324 = llvm.add %287, %286 : i64
+    llvm.br ^bb27(%324 : i64)
+  ^bb35:  // pred: ^bb27
+    %325 = llvm.add %282, %281 : i64
+    llvm.br ^bb25(%325 : i64)
+  ^bb36:  // pred: ^bb25
+    %326 = llvm.extractvalue %106[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    llvm.call @free(%326) : (!llvm.ptr) -> ()
+    %327 = llvm.extractvalue %123[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%327) : (!llvm.ptr) -> ()
+    %328 = llvm.extractvalue %153[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%328) : (!llvm.ptr) -> ()
+    %329 = llvm.extractvalue %136[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    llvm.call @free(%329) : (!llvm.ptr) -> ()
+    %330 = llvm.mlir.addressof @frmt_spec : !llvm.ptr
+    %331 = llvm.mlir.constant(0 : index) : i64
+    %332 = llvm.getelementptr %330[%331, %331] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8>
+    %333 = llvm.mlir.addressof @nl : !llvm.ptr
+    %334 = llvm.mlir.constant(0 : index) : i64
+    %335 = llvm.getelementptr %333[%334, %334] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8>
+    %336 = llvm.mlir.constant(0 : index) : i64
+    %337 = llvm.mlir.constant(2 : index) : i64
+    %338 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb37(%336 : i64)
+  ^bb37(%339: i64):  // 2 preds: ^bb36, ^bb47
+    %340 = llvm.icmp "slt" %339, %337 : i64
+    llvm.cond_br %340, ^bb38, ^bb48
+  ^bb38:  // pred: ^bb37
+    %341 = llvm.mlir.constant(0 : index) : i64
+    %342 = llvm.mlir.constant(2 : index) : i64
+    %343 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb39(%341 : i64)
+  ^bb39(%344: i64):  // 2 preds: ^bb38, ^bb46
+    %345 = llvm.icmp "slt" %344, %342 : i64
+    llvm.cond_br %345, ^bb40, ^bb47
+  ^bb40:  // pred: ^bb39
+    %346 = llvm.mlir.constant(0 : index) : i64
+    %347 = llvm.mlir.constant(3 : index) : i64
+    %348 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb41(%346 : i64)
+  ^bb41(%349: i64):  // 2 preds: ^bb40, ^bb45
+    %350 = llvm.icmp "slt" %349, %347 : i64
+    llvm.cond_br %350, ^bb42, ^bb46
+  ^bb42:  // pred: ^bb41
+    %351 = llvm.mlir.constant(0 : index) : i64
+    %352 = llvm.mlir.constant(3 : index) : i64
+    %353 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb43(%351 : i64)
+  ^bb43(%354: i64):  // 2 preds: ^bb42, ^bb44
+    %355 = llvm.icmp "slt" %354, %352 : i64
+    llvm.cond_br %355, ^bb44, ^bb45
+  ^bb44:  // pred: ^bb43
+    %356 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %357 = llvm.mlir.constant(18 : index) : i64
+    %358 = llvm.mul %339, %357 : i64
+    %359 = llvm.mlir.constant(9 : index) : i64
+    %360 = llvm.mul %344, %359 : i64
+    %361 = llvm.add %358, %360 : i64
+    %362 = llvm.mlir.constant(3 : index) : i64
+    %363 = llvm.mul %349, %362 : i64
+    %364 = llvm.add %361, %363 : i64
+    %365 = llvm.add %364, %354 : i64
+    %366 = llvm.getelementptr %356[%365] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %367 = llvm.load %366 : !llvm.ptr -> f32
+    %368 = llvm.fpext %367 : f32 to f64
+    %369 = llvm.call @printf(%332, %368) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, f64) -> i32
+    %370 = llvm.add %354, %353 : i64
+    llvm.br ^bb43(%370 : i64)
+  ^bb45:  // pred: ^bb43
+    %371 = llvm.call @printf(%335) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %372 = llvm.add %349, %348 : i64
+    llvm.br ^bb41(%372 : i64)
+  ^bb46:  // pred: ^bb41
+    %373 = llvm.call @printf(%335) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %374 = llvm.add %344, %343 : i64
+    llvm.br ^bb39(%374 : i64)
+  ^bb47:  // pred: ^bb39
+    %375 = llvm.call @printf(%335) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %376 = llvm.add %339, %338 : i64
+    llvm.br ^bb37(%376 : i64)
+  ^bb48:  // pred: ^bb37
+    llvm.return %0 : i8
+  }
+}
+
+

From 5c7aad33632a83d429593c878822d0b106946a8e Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Tue, 9 Dec 2025 01:59:05 -0800
Subject: [PATCH 03/13] Add helper script to dump Gemmini lowering IR

---
 experiments/gemmini/scripts/dump_one.sh | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100755 experiments/gemmini/scripts/dump_one.sh

diff --git a/experiments/gemmini/scripts/dump_one.sh b/experiments/gemmini/scripts/dump_one.sh
new file mode 100755
index 0000000..eb68498
--- /dev/null
+++ b/experiments/gemmini/scripts/dump_one.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Usage: dump_one.sh <input.mlir> [outdir]   (set BUDDY_OPT to override the buddy-opt binary)
+BUDDY_OPT="${BUDDY_OPT:-$HOME/work/buddy-mlir/build/bin/buddy-opt}"
+IN="${1:?usage: $0 <input.mlir> [outdir]}"
+OUTDIR="${2:-experiments/gemmini/logs}"
+mkdir -p "$OUTDIR"
+# The log is named after the input file: <outdir>/<input basename>.print-after-all.mlir
+base="$(basename "$IN")"
+LOG="$OUTDIR/${base}.print-after-all.mlir"
+# stderr goes to the log, stdout is discarded; on failure, report it and keep buddy-opt's exit status.
+"$BUDDY_OPT" "$IN" \
+  --convert-linalg-to-gemmini \
+  --lower-gemmini \
+  --mlir-print-ir-after-all \
+  2> "$LOG" > /dev/null || { rc=$?; echo "buddy-opt failed (exit $rc); see $LOG" >&2; exit "$rc"; }
+# Report the log path, then preview up to 20 lines mentioning gemmini ops; '|| true' tolerates no matches.
+echo "Wrote: $LOG"
+grep -n "gemmini\\." "$LOG" | head -n 20 || true

From 8f3a9428e8db3afa9a2e6ccab89f91bb172f4a66 Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Tue, 9 Dec 2025 02:01:16 -0800
Subject: [PATCH 04/13] Add IR dump for tile-matmul-ws-softmax.mlir

---
 .../inputs/tile-matmul-ws-softmax.mlir        |  49 +++
 ...atmul-ws-softmax.mlir.print-after-all.mlir | 323 ++++++++++++++++++
 2 files changed, 372 insertions(+)
 create mode 100644 experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir
 create mode 100644 experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir

diff --git a/experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir b/experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir
new file mode 100644
index 0000000..c81bccc
--- /dev/null
+++ b/experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir
@@ -0,0 +1,49 @@
+// RUN: buddy-opt %s \
+// RUN:     --lower-gemmini | \
+// RUN: FileCheck %s
+// @g1 and @g2 are the constant 5x5 i8 inputs that @main loads as %aArray and %bArray.
+memref.global "private" @g1 : memref<5x5xi8> = dense<[[1, 0, 0, 1, 0], [1, -1, 1, 0, 0], [-1, 0, 1, -1, 1], [1, 0, 0, 1, 0], [-1, 0, 0, -1, 0]]>
+memref.global "private" @g2 : memref<5x5xi8> = dense<[[1, -1, 0, 0, 1], [1, 0, -1, 0, -1], [-1, -1, 0, -1, 1], [-1, 0, 0, 1, 0], [1, 0, 0, -1, 0]]>
+
+
+func.func @main() -> i8 {
+  %i0 = arith.constant 0 : i8
+  %i1I8 = arith.constant 1 : i8
+  %minus1 = arith.constant -2 : i8  // NOTE(review): named %minus1 but holds -2; unused in this function.
+  %i2I8 = arith.constant 2 : i8
+  %i2I32 = arith.constant 2 : i32
+  %dI32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %aArray = memref.get_global @g1 : memref<5x5xi8>
+  %bArray = memref.get_global @g2 : memref<5x5xi8>
+  %cArray = memref.alloc()  : memref<5x5xi8>
+  %dArray = memref.alloc()  : memref<5x5xi32>
+  %dim_I = memref.dim %aArray, %c0 : memref<5x5xi8>
+  %dim_J = memref.dim %bArray, %c1 : memref<5x5xi8>
+  %dim_K = memref.dim %aArray, %c1 : memref<5x5xi8>
+  // Store 0 into every element of %dArray before it is passed to the tile_matmul ops.
+  scf.for %i3 = %c0 to %dim_I step %c1 {
+    scf.for %j3 = %c0 to %dim_J step %c1 {
+      memref.store %dI32, %dArray[%i3, %j3] : memref<5x5xi32>
+    }
+  }
+  // First matmul uses dataflow=1 only; %cArray is printed afterwards.
+  gemmini.tile_matmul %aArray %bArray %cArray %dArray {dataflow=1}: memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32>
+  gemmini.print %cArray : memref<5x5xi8>
+  // Expected Gemmini intrinsics in the lowered output, matched by FileCheck in this order.
+  // CHECK: "gemmini.intr.config_ex"
+  // CHECK: "gemmini.intr.config_st"
+  // CHECK: "gemmini.intr.config_ld"
+  // CHECK: "gemmini.intr.config_norm"
+  // CHECK: "gemmini.intr.loop_ws_config_bounds"
+  // CHECK: "gemmini.intr.loop_ws_config_addrs_ab"
+  // CHECK: "gemmini.intr.loop_ws_config_addrs_dc"
+  // CHECK: "gemmini.intr.loop_ws_config_strides_ab"
+  // CHECK: "gemmini.intr.loop_ws_config_strides_dc"
+  // CHECK: "gemmini.intr.loop_ws"
+  // CHECK: "gemmini.intr.flush"
+  gemmini.tile_matmul %aArray %bArray %cArray %dArray {dataflow=1, act=4, bertScale=0.05:f32}: memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32>
+  gemmini.print %cArray : memref<5x5xi8>
+  return %i0 : i8
+}
diff --git a/experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir b/experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir
new file mode 100644
index 0000000..abf64aa
--- /dev/null
+++ b/experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir
@@ -0,0 +1,323 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {
+  memref.global "private" @g1 : memref<5x5xi8> = dense<[[1, 0, 0, 1, 0], [1, -1, 1, 0, 0], [-1, 0, 1, -1, 1], [1, 0, 0, 1, 0], [-1, 0, 0, -1, 0]]>
+  memref.global "private" @g2 : memref<5x5xi8> = dense<[[1, -1, 0, 0, 1], [1, 0, -1, 0, -1], [-1, -1, 0, -1, 1], [-1, 0, 0, 1, 0], [1, 0, 0, -1, 0]]>
+  func.func @main() -> i8 {
+    %c0_i8 = arith.constant 0 : i8
+    %c1_i8 = arith.constant 1 : i8
+    %c-2_i8 = arith.constant -2 : i8
+    %c2_i8 = arith.constant 2 : i8
+    %c2_i32 = arith.constant 2 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = memref.get_global @g1 : memref<5x5xi8>
+    %1 = memref.get_global @g2 : memref<5x5xi8>
+    %alloc = memref.alloc() : memref<5x5xi8>
+    %alloc_0 = memref.alloc() : memref<5x5xi32>
+    %dim = memref.dim %0, %c0 : memref<5x5xi8>
+    %dim_1 = memref.dim %1, %c1 : memref<5x5xi8>
+    %dim_2 = memref.dim %0, %c1 : memref<5x5xi8>
+    scf.for %arg0 = %c0 to %dim step %c1 {
+      scf.for %arg1 = %c0 to %dim_1 step %c1 {
+        memref.store %c0_i32, %alloc_0[%arg0, %arg1] : memref<5x5xi32>
+      }
+    }
+    gemmini.tile_matmul %0 %1 %alloc %alloc_0 : memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32>
+    gemmini.print %alloc : memref<5x5xi8>
+    gemmini.tile_matmul %0 %1 %alloc %alloc_0 {act = 4 : i64, bertScale = 5.000000e-02 : f32} : memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32>
+    gemmini.print %alloc : memref<5x5xi8>
+    return %c0_i8 : i8
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.mlir.global internal constant @nl("\0A\00") {addr_space = 0 : i32}
+  llvm.mlir.global internal constant @frmt_spec("%d \00") {addr_space = 0 : i32}
+  llvm.func @printf(!llvm.ptr, ...) -> i32
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.mlir.global private @g1(dense<[[1, 0, 0, 1, 0], [1, -1, 1, 0, 0], [-1, 0, 1, -1, 1], [1, 0, 0, 1, 0], [-1, 0, 0, -1, 0]]> : tensor<5x5xi8>) {addr_space = 0 : i32} : !llvm.array<5 x array<5 x i8>>
+  llvm.mlir.global private @g2(dense<[[1, -1, 0, 0, 1], [1, 0, -1, 0, -1], [-1, -1, 0, -1, 1], [-1, 0, 0, 1, 0], [1, 0, 0, -1, 0]]> : tensor<5x5xi8>) {addr_space = 0 : i32} : !llvm.array<5 x array<5 x i8>>
+  llvm.func @main() -> i8 {
+    %0 = llvm.mlir.constant(0 : i8) : i8
+    %1 = llvm.mlir.constant(1 : i8) : i8
+    %2 = llvm.mlir.constant(-2 : i8) : i8
+    %3 = llvm.mlir.constant(2 : i8) : i8
+    %4 = llvm.mlir.constant(2 : i32) : i32
+    %5 = llvm.mlir.constant(0 : i32) : i32
+    %6 = llvm.mlir.constant(0 : index) : i64
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(5 : index) : i64
+    %9 = llvm.mlir.constant(5 : index) : i64
+    %10 = llvm.mlir.constant(1 : index) : i64
+    %11 = llvm.mlir.constant(25 : index) : i64
+    %12 = llvm.mlir.zero : !llvm.ptr
+    %13 = llvm.getelementptr %12[%11] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %14 = llvm.ptrtoint %13 : !llvm.ptr to i64
+    %15 = llvm.mlir.addressof @g1 : !llvm.ptr
+    %16 = llvm.getelementptr %15[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<5 x array<5 x i8>>
+    %17 = llvm.mlir.constant(3735928559 : index) : i64
+    %18 = llvm.inttoptr %17 : i64 to !llvm.ptr
+    %19 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %20 = llvm.insertvalue %18, %19[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %21 = llvm.insertvalue %16, %20[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %22 = llvm.mlir.constant(0 : index) : i64
+    %23 = llvm.insertvalue %22, %21[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %24 = llvm.insertvalue %8, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %25 = llvm.insertvalue %9, %24[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %26 = llvm.insertvalue %9, %25[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %27 = llvm.insertvalue %10, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %28 = llvm.mlir.constant(5 : index) : i64
+    %29 = llvm.mlir.constant(5 : index) : i64
+    %30 = llvm.mlir.constant(1 : index) : i64
+    %31 = llvm.mlir.constant(25 : index) : i64
+    %32 = llvm.mlir.zero : !llvm.ptr
+    %33 = llvm.getelementptr %32[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %34 = llvm.ptrtoint %33 : !llvm.ptr to i64
+    %35 = llvm.mlir.addressof @g2 : !llvm.ptr
+    %36 = llvm.getelementptr %35[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<5 x array<5 x i8>>
+    %37 = llvm.mlir.constant(3735928559 : index) : i64
+    %38 = llvm.inttoptr %37 : i64 to !llvm.ptr
+    %39 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %40 = llvm.insertvalue %38, %39[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %41 = llvm.insertvalue %36, %40[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %42 = llvm.mlir.constant(0 : index) : i64
+    %43 = llvm.insertvalue %42, %41[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %44 = llvm.insertvalue %28, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %45 = llvm.insertvalue %29, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %46 = llvm.insertvalue %29, %45[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %47 = llvm.insertvalue %30, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %48 = llvm.mlir.constant(5 : index) : i64
+    %49 = llvm.mlir.constant(5 : index) : i64
+    %50 = llvm.mlir.constant(1 : index) : i64
+    %51 = llvm.mlir.constant(25 : index) : i64
+    %52 = llvm.mlir.zero : !llvm.ptr
+    %53 = llvm.getelementptr %52[%51] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %54 = llvm.ptrtoint %53 : !llvm.ptr to i64
+    %55 = llvm.call @malloc(%54) : (i64) -> !llvm.ptr
+    %56 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %57 = llvm.insertvalue %55, %56[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %58 = llvm.insertvalue %55, %57[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %59 = llvm.mlir.constant(0 : index) : i64
+    %60 = llvm.insertvalue %59, %58[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %61 = llvm.insertvalue %48, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %62 = llvm.insertvalue %49, %61[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %63 = llvm.insertvalue %49, %62[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %64 = llvm.insertvalue %50, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %65 = llvm.mlir.constant(5 : index) : i64
+    %66 = llvm.mlir.constant(5 : index) : i64
+    %67 = llvm.mlir.constant(1 : index) : i64
+    %68 = llvm.mlir.constant(25 : index) : i64
+    %69 = llvm.mlir.zero : !llvm.ptr
+    %70 = llvm.getelementptr %69[%68] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %71 = llvm.ptrtoint %70 : !llvm.ptr to i64
+    %72 = llvm.call @malloc(%71) : (i64) -> !llvm.ptr
+    %73 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %74 = llvm.insertvalue %72, %73[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %75 = llvm.insertvalue %72, %74[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %76 = llvm.mlir.constant(0 : index) : i64
+    %77 = llvm.insertvalue %76, %75[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %78 = llvm.insertvalue %65, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %79 = llvm.insertvalue %66, %78[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %80 = llvm.insertvalue %66, %79[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %81 = llvm.insertvalue %67, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %82 = llvm.mlir.constant(5 : index) : i64
+    %83 = llvm.mlir.constant(5 : index) : i64
+    %84 = llvm.mlir.constant(5 : index) : i64
+    llvm.br ^bb1(%6 : i64)
+  ^bb1(%85: i64):  // 2 preds: ^bb0, ^bb5
+    %86 = llvm.icmp "slt" %85, %82 : i64
+    llvm.cond_br %86, ^bb2, ^bb6
+  ^bb2:  // pred: ^bb1
+    llvm.br ^bb3(%6 : i64)
+  ^bb3(%87: i64):  // 2 preds: ^bb2, ^bb4
+    %88 = llvm.icmp "slt" %87, %83 : i64
+    llvm.cond_br %88, ^bb4, ^bb5
+  ^bb4:  // pred: ^bb3
+    %89 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %90 = llvm.mlir.constant(5 : index) : i64
+    %91 = llvm.mul %85, %90 : i64
+    %92 = llvm.add %91, %87 : i64
+    %93 = llvm.getelementptr %89[%92] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    llvm.store %5, %93 : i32, !llvm.ptr
+    %94 = llvm.add %87, %7 : i64
+    llvm.br ^bb3(%94 : i64)
+  ^bb5:  // pred: ^bb3
+    %95 = llvm.add %85, %7 : i64
+    llvm.br ^bb1(%95 : i64)
+  ^bb6:  // pred: ^bb1
+    %96 = llvm.extractvalue %27[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %97 = llvm.ptrtoint %96 : !llvm.ptr to i64
+    %98 = llvm.extractvalue %47[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %99 = llvm.ptrtoint %98 : !llvm.ptr to i64
+    %100 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %101 = llvm.ptrtoint %100 : !llvm.ptr to i64
+    %102 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %103 = llvm.ptrtoint %102 : !llvm.ptr to i64
+    %104 = llvm.mlir.constant(4575657221408489476 : i64) : i64
+    %105 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%104, %105) : (i64, i64) -> ()
+    %106 = llvm.mlir.constant(5 : i64) : i64
+    %107 = llvm.mlir.constant(2 : i64) : i64
+    %108 = llvm.mlir.constant(4575657221408423941 : i64) : i64
+    "gemmini.intr.config_st"(%107, %108) : (i64, i64) -> ()
+    %109 = llvm.mlir.constant(5 : i64) : i64
+    %110 = llvm.mlir.constant(4575657221409472769 : i64) : i64
+    "gemmini.intr.config_ld"(%110, %109) : (i64, i64) -> ()
+    %111 = llvm.mlir.constant(5 : i64) : i64
+    %112 = llvm.mlir.constant(4575657221409472777 : i64) : i64
+    "gemmini.intr.config_ld"(%112, %111) : (i64, i64) -> ()
+    %113 = llvm.mlir.constant(20 : i64) : i64
+    %114 = llvm.mlir.constant(4575657221409472785 : i64) : i64
+    "gemmini.intr.config_ld"(%114, %113) : (i64, i64) -> ()
+    %115 = llvm.mlir.constant(0 : i64) : i64
+    %116 = llvm.mlir.constant(0 : i64) : i64
+    %117 = llvm.mlir.constant(0 : i64) : i64
+    %118 = llvm.mlir.constant(0 : i64) : i64
+    %119 = llvm.mlir.constant(47245361163 : i64) : i64
+    %120 = llvm.mlir.constant(4295032833 : i64) : i64
+    "gemmini.intr.loop_ws_config_bounds"(%119, %120) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_ab"(%97, %99) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_dc"(%103, %101) : (i64, i64) -> ()
+    %121 = llvm.mlir.constant(5 : i64) : i64
+    %122 = llvm.mlir.constant(5 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_ab"(%121, %122) : (i64, i64) -> ()
+    %123 = llvm.mlir.constant(5 : i64) : i64
+    %124 = llvm.mlir.constant(5 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_dc"(%123, %124) : (i64, i64) -> ()
+    %125 = llvm.mlir.constant(1 : i64) : i64
+    %126 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_ws"(%125, %126) : (i64, i64) -> ()
+    %127 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%127, %127) : (i64, i64) -> ()
+    %128 = llvm.mlir.addressof @frmt_spec : !llvm.ptr
+    %129 = llvm.mlir.constant(0 : index) : i64
+    %130 = llvm.getelementptr %128[%129, %129] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8>
+    %131 = llvm.mlir.addressof @nl : !llvm.ptr
+    %132 = llvm.mlir.constant(0 : index) : i64
+    %133 = llvm.getelementptr %131[%132, %132] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8>
+    %134 = llvm.mlir.constant(0 : index) : i64
+    %135 = llvm.mlir.constant(5 : index) : i64
+    %136 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb7(%134 : i64)
+  ^bb7(%137: i64):  // 2 preds: ^bb6, ^bb11
+    %138 = llvm.icmp "slt" %137, %135 : i64
+    llvm.cond_br %138, ^bb8, ^bb12
+  ^bb8:  // pred: ^bb7
+    %139 = llvm.mlir.constant(0 : index) : i64
+    %140 = llvm.mlir.constant(5 : index) : i64
+    %141 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb9(%139 : i64)
+  ^bb9(%142: i64):  // 2 preds: ^bb8, ^bb10
+    %143 = llvm.icmp "slt" %142, %140 : i64
+    llvm.cond_br %143, ^bb10, ^bb11
+  ^bb10:  // pred: ^bb9
+    %144 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %145 = llvm.mlir.constant(5 : index) : i64
+    %146 = llvm.mul %137, %145 : i64
+    %147 = llvm.add %146, %142 : i64
+    %148 = llvm.getelementptr %144[%147] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %149 = llvm.load %148 : !llvm.ptr -> i8
+    %150 = llvm.sext %149 : i8 to i32
+    %151 = llvm.call @printf(%130, %150) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+    %152 = llvm.add %142, %141 : i64
+    llvm.br ^bb9(%152 : i64)
+  ^bb11:  // pred: ^bb9
+    %153 = llvm.call @printf(%133) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %154 = llvm.add %137, %136 : i64
+    llvm.br ^bb7(%154 : i64)
+  ^bb12:  // pred: ^bb7
+    %155 = llvm.extractvalue %27[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %156 = llvm.ptrtoint %155 : !llvm.ptr to i64
+    %157 = llvm.extractvalue %47[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %158 = llvm.ptrtoint %157 : !llvm.ptr to i64
+    %159 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %160 = llvm.ptrtoint %159 : !llvm.ptr to i64
+    %161 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %162 = llvm.ptrtoint %161 : !llvm.ptr to i64
+    %163 = llvm.mlir.constant(4575657221408489476 : i64) : i64
+    %164 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%163, %164) : (i64, i64) -> ()
+    %165 = llvm.mlir.constant(5 : i64) : i64
+    %166 = llvm.mlir.constant(2 : i64) : i64
+    %167 = llvm.mlir.constant(4575657221408423941 : i64) : i64
+    "gemmini.intr.config_st"(%166, %167) : (i64, i64) -> ()
+    %168 = llvm.mlir.constant(5 : i64) : i64
+    %169 = llvm.mlir.constant(4575657221409472769 : i64) : i64
+    "gemmini.intr.config_ld"(%169, %168) : (i64, i64) -> ()
+    %170 = llvm.mlir.constant(5 : i64) : i64
+    %171 = llvm.mlir.constant(4575657221409472777 : i64) : i64
+    "gemmini.intr.config_ld"(%171, %170) : (i64, i64) -> ()
+    %172 = llvm.mlir.constant(20 : i64) : i64
+    %173 = llvm.mlir.constant(4575657221409472785 : i64) : i64
+    "gemmini.intr.config_ld"(%173, %172) : (i64, i64) -> ()
+    %174 = llvm.mlir.constant(55834640387 : i64) : i64
+    %175 = llvm.mlir.constant(1644972474395 : i64) : i64
+    "gemmini.intr.config_norm"(%174, %175) : (i64, i64) -> ()
+    %176 = llvm.mlir.constant(21650930466819 : i64) : i64
+    %177 = llvm.mlir.constant(1644972474395 : i64) : i64
+    "gemmini.intr.config_norm"(%176, %177) : (i64, i64) -> ()
+    %178 = llvm.mlir.constant(0 : i64) : i64
+    %179 = llvm.mlir.constant(0 : i64) : i64
+    %180 = llvm.mlir.constant(0 : i64) : i64
+    %181 = llvm.mlir.constant(0 : i64) : i64
+    %182 = llvm.mlir.constant(47245361163 : i64) : i64
+    %183 = llvm.mlir.constant(4295032833 : i64) : i64
+    "gemmini.intr.loop_ws_config_bounds"(%182, %183) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_ab"(%156, %158) : (i64, i64) -> ()
+    "gemmini.intr.loop_ws_config_addrs_dc"(%162, %160) : (i64, i64) -> ()
+    %184 = llvm.mlir.constant(5 : i64) : i64
+    %185 = llvm.mlir.constant(5 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_ab"(%184, %185) : (i64, i64) -> ()
+    %186 = llvm.mlir.constant(5 : i64) : i64
+    %187 = llvm.mlir.constant(5 : i64) : i64
+    "gemmini.intr.loop_ws_config_strides_dc"(%186, %187) : (i64, i64) -> ()
+    %188 = llvm.mlir.constant(1025 : i64) : i64
+    %189 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_ws"(%188, %189) : (i64, i64) -> ()
+    %190 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%190, %190) : (i64, i64) -> ()
+    %191 = llvm.mlir.addressof @frmt_spec : !llvm.ptr
+    %192 = llvm.mlir.constant(0 : index) : i64
+    %193 = llvm.getelementptr %191[%192, %192] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8>
+    %194 = llvm.mlir.addressof @nl : !llvm.ptr
+    %195 = llvm.mlir.constant(0 : index) : i64
+    %196 = llvm.getelementptr %194[%195, %195] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8>
+    %197 = llvm.mlir.constant(0 : index) : i64
+    %198 = llvm.mlir.constant(5 : index) : i64
+    %199 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%197 : i64)
+  ^bb13(%200: i64):  // 2 preds: ^bb12, ^bb17
+    %201 = llvm.icmp "slt" %200, %198 : i64
+    llvm.cond_br %201, ^bb14, ^bb18
+  ^bb14:  // pred: ^bb13
+    %202 = llvm.mlir.constant(0 : index) : i64
+    %203 = llvm.mlir.constant(5 : index) : i64
+    %204 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb15(%202 : i64)
+  ^bb15(%205: i64):  // 2 preds: ^bb14, ^bb16
+    %206 = llvm.icmp "slt" %205, %203 : i64
+    llvm.cond_br %206, ^bb16, ^bb17
+  ^bb16:  // pred: ^bb15
+    %207 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %208 = llvm.mlir.constant(5 : index) : i64
+    %209 = llvm.mul %200, %208 : i64
+    %210 = llvm.add %209, %205 : i64
+    %211 = llvm.getelementptr %207[%210] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+    %212 = llvm.load %211 : !llvm.ptr -> i8
+    %213 = llvm.sext %212 : i8 to i32
+    %214 = llvm.call @printf(%193, %213) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32) -> i32
+    %215 = llvm.add %205, %204 : i64
+    llvm.br ^bb15(%215 : i64)
+  ^bb17:  // pred: ^bb15
+    %216 = llvm.call @printf(%196) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr) -> i32
+    %217 = llvm.add %200, %199 : i64
+    llvm.br ^bb13(%217 : i64)
+  ^bb18:  // pred: ^bb13
+    llvm.return %0 : i8
+  }
+}
+
+

From 3221999330ec41bc2907d92388995e51881071f8 Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Tue, 9 Dec 2025 02:51:47 -0800
Subject: [PATCH 05/13] Document libgemmini build status on Mac

---
 experiments/gemmini/libgemmini_status.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 experiments/gemmini/libgemmini_status.txt

diff --git a/experiments/gemmini/libgemmini_status.txt b/experiments/gemmini/libgemmini_status.txt
new file mode 100644
index 0000000..170f8b0
--- /dev/null
+++ b/experiments/gemmini/libgemmini_status.txt
@@ -0,0 +1,13 @@
+Mac setup notes:
+
+- buddy-mlir Gemmini lowering works (matmul, batch_matmul, conv, matmul+softmax).
+- Generated LLVM IR (log.ll) and RISC-V asm (log.s) via Makefile targets; the asm contains Gemmini ops (config_ex, config_st, loop_ws, etc.).
+- Spike + pk + riscv64-unknown-elf-gcc work for a simple "hello" test.
+
+Blocked on:
+- Installing libgemmini (Spike extension) from https://github.com/ucb-bar/libgemmini.
+- `make libgemmini.so` fails on macOS with `ld: symbol(s) not found for architecture arm64`; the build also depends on RISCV-dependent paths that assume a full Chipyard/Gemmini tree.
+
+Plan:
+- Use Mac primarily for IR/pipeline experiments.
+- Do Spike+Gemmini execution on a SLICE Linux machine with Chipyard/Gemmini installed.

From 48ff9b047d622888fa240eedef5f40942ed7f4a7 Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Thu, 11 Dec 2025 15:40:40 -0800
Subject: [PATCH 06/13] Buddy Gemmini: conv2d block lowering (NHWC x HWCF)

---
 .../logs/conv2d_block1.print-after-all.mlir   | 693 ++++++++++++++++++
 .../gemmini/networks/conv2d_block1.mlir       |  13 +
 2 files changed, 706 insertions(+)
 create mode 100644 experiments/gemmini/logs/conv2d_block1.print-after-all.mlir
 create mode 100644 experiments/gemmini/networks/conv2d_block1.mlir

diff --git a/experiments/gemmini/logs/conv2d_block1.print-after-all.mlir b/experiments/gemmini/logs/conv2d_block1.print-after-all.mlir
new file mode 100644
index 0000000..e22f289
--- /dev/null
+++ b/experiments/gemmini/logs/conv2d_block1.print-after-all.mlir
@@ -0,0 +1,693 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {  // conv2d_block1 after convert-linalg-to-gemmini: filter repack -> gemmini.tile_conv -> copy-back of the result
+  func.func @conv2d_block1(%arg0: memref<1x32x32x32xf16>, %arg1: memref<3x3x32x64xf16>, %arg2: memref<1x30x30x64xf32>) {  // %arg1 is read by the repack loop, %arg2 is written by the copy-back loop; %arg0 is only passed to gemmini.tile_conv
+    %alloc = memref.alloc() : memref<288x64xf16>  // 2-D copy of the filter %arg1: 288 = 3*3*32 rows, 64 columns
+    %alloc_0 = memref.alloc() : memref<900x64xf32>  // 2-D staging buffer later copied into %arg2: 900 = 30*30 rows, 64 columns
+    %alloc_1 = memref.alloc() : memref<64xi32>  // zero-filled below, then passed as the third operand of gemmini.tile_conv; presumably the bias vector -- TODO confirm
+    %c0_i32 = arith.constant 0 : i32
+    linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<64xi32>)  // every element of %alloc_1 becomes 0
+    %c0 = arith.constant 0 : index  // shared lower bound of every scf.for in this function
+    %c1 = arith.constant 1 : index  // shared step of every scf.for in this function
+    %c3 = arith.constant 3 : index
+    scf.for %arg3 = %c0 to %c3 step %c1 {  // repack nest, dim 0 of %arg1 (extent 3)
+      %c3_3 = arith.constant 3 : index
+      scf.for %arg4 = %c0 to %c3_3 step %c1 {  // dim 1 of %arg1 (extent 3)
+        %c32 = arith.constant 32 : index
+        scf.for %arg5 = %c0 to %c32 step %c1 {  // dim 2 of %arg1 (extent 32)
+          %c64 = arith.constant 64 : index
+          scf.for %arg6 = %c0 to %c64 step %c1 {  // dim 3 of %arg1 (extent 64); used unchanged as the column of %alloc
+            %c3_4 = arith.constant 3 : index
+            %c32_5 = arith.constant 32 : index
+            %0 = arith.muli %arg3, %c3_4 : index  // %arg3 * 3
+            %1 = arith.muli %0, %c32_5 : index  // %arg3 * 3 * 32
+            %2 = arith.muli %arg4, %c32_5 : index  // %arg4 * 32
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index  // destination row = %arg3*96 + %arg4*32 + %arg5, i.e. the first three filter dims linearised row-major
+            %5 = memref.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<3x3x32x64xf16>
+            memref.store %5, %alloc[%4, %arg6] : memref<288x64xf16>  // element-wise copy, no type conversion (f16 -> f16)
+          }
+        }
+      }
+    }
+    %c30_i64 = arith.constant 30 : i64  // passed twice to gemmini.tile_conv; presumably output height and width -- TODO confirm against the Gemmini dialect
+    %c3_i64 = arith.constant 3 : i64  // last gemmini.tile_conv operand; presumably the kernel dimension -- TODO confirm
+    gemmini.tile_conv %arg0 %alloc %alloc_1 %alloc_0 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x32xf16> memref<288x64xf16> memref<64xi32> memref<900x64xf32> i64 i64 i64  // operands: input %arg0, repacked filter %alloc, zero-filled %alloc_1, staging buffer %alloc_0; NOTE(review): f16 / i32 / f32 element types are mixed here -- verify the dialect accepts this
+    %c1_2 = arith.constant 1 : index
+    scf.for %arg3 = %c0 to %c1_2 step %c1 {  // copy-back nest, dim 0 of %arg2 (extent 1)
+      %c30 = arith.constant 30 : index
+      scf.for %arg4 = %c0 to %c30 step %c1 {  // dim 1 of %arg2 (extent 30)
+        %c30_3 = arith.constant 30 : index
+        scf.for %arg5 = %c0 to %c30_3 step %c1 {  // dim 2 of %arg2 (extent 30)
+          %c64 = arith.constant 64 : index
+          scf.for %arg6 = %c0 to %c64 step %c1 {  // dim 3 of %arg2 (extent 64); used unchanged as the column of %alloc_0
+            %c30_4 = arith.constant 30 : index
+            %0 = arith.muli %arg3, %c30_4 : index  // %arg3 * 30
+            %1 = arith.muli %0, %c30_4 : index  // %arg3 * 30 * 30
+            %2 = arith.muli %c30_4, %arg4 : index  // 30 * %arg4
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index  // source row = %arg3*900 + %arg4*30 + %arg5, i.e. the first three dims of %arg2 linearised row-major
+            %5 = memref.load %alloc_0[%4, %arg6] : memref<900x64xf32>
+            memref.store %5, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x30x30x64xf32>  // scatter the 2-D staging buffer back into the 4-D output
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc : memref<288x64xf16>  // release all three temporaries allocated at the top of the function
+    memref.dealloc %alloc_0 : memref<900x64xf32>
+    memref.dealloc %alloc_1 : memref<64xi32>
+    return
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @conv2d_block1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %1 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %2 = llvm.insertvalue %arg23, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %3 = llvm.insertvalue %arg24, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %4 = llvm.insertvalue %arg25, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %5 = llvm.insertvalue %arg29, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %6 = llvm.insertvalue %arg26, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %7 = llvm.insertvalue %arg30, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %8 = llvm.insertvalue %arg27, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %9 = llvm.insertvalue %arg31, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %10 = llvm.insertvalue %arg28, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %11 = llvm.insertvalue %arg32, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %13 = llvm.insertvalue %arg0, %12[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %14 = llvm.insertvalue %arg1, %13[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %15 = llvm.insertvalue %arg2, %14[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %16 = llvm.insertvalue %arg3, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %17 = llvm.insertvalue %arg7, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %18 = llvm.insertvalue %arg4, %17[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %19 = llvm.insertvalue %arg8, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %20 = llvm.insertvalue %arg5, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %21 = llvm.insertvalue %arg9, %20[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %22 = llvm.insertvalue %arg6, %21[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %23 = llvm.insertvalue %arg10, %22[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %25 = llvm.insertvalue %arg11, %24[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %26 = llvm.insertvalue %arg12, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %27 = llvm.insertvalue %arg13, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %28 = llvm.insertvalue %arg14, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %29 = llvm.insertvalue %arg18, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %30 = llvm.insertvalue %arg15, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %31 = llvm.insertvalue %arg19, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %32 = llvm.insertvalue %arg16, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %33 = llvm.insertvalue %arg20, %32[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %34 = llvm.insertvalue %arg17, %33[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %35 = llvm.insertvalue %arg21, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %36 = llvm.mlir.constant(288 : index) : i64
+    %37 = llvm.mlir.constant(64 : index) : i64
+    %38 = llvm.mlir.constant(1 : index) : i64
+    %39 = llvm.mlir.constant(18432 : index) : i64
+    %40 = llvm.mlir.zero : !llvm.ptr
+    %41 = llvm.getelementptr %40[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16
+    %42 = llvm.ptrtoint %41 : !llvm.ptr to i64
+    %43 = llvm.call @malloc(%42) : (i64) -> !llvm.ptr
+    %44 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %45 = llvm.insertvalue %43, %44[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %46 = llvm.insertvalue %43, %45[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %47 = llvm.mlir.constant(0 : index) : i64
+    %48 = llvm.insertvalue %47, %46[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %49 = llvm.insertvalue %36, %48[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %50 = llvm.insertvalue %37, %49[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %51 = llvm.insertvalue %37, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %52 = llvm.insertvalue %38, %51[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %53 = llvm.mlir.constant(900 : index) : i64
+    %54 = llvm.mlir.constant(64 : index) : i64
+    %55 = llvm.mlir.constant(1 : index) : i64
+    %56 = llvm.mlir.constant(57600 : index) : i64
+    %57 = llvm.mlir.zero : !llvm.ptr
+    %58 = llvm.getelementptr %57[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %59 = llvm.ptrtoint %58 : !llvm.ptr to i64
+    %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr
+    %61 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %64 = llvm.mlir.constant(0 : index) : i64
+    %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %66 = llvm.insertvalue %53, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %67 = llvm.insertvalue %54, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %68 = llvm.insertvalue %54, %67[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %69 = llvm.insertvalue %55, %68[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %70 = llvm.mlir.constant(64 : index) : i64
+    %71 = llvm.mlir.constant(1 : index) : i64
+    %72 = llvm.mlir.zero : !llvm.ptr
+    %73 = llvm.getelementptr %72[%70] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %74 = llvm.ptrtoint %73 : !llvm.ptr to i64
+    %75 = llvm.call @malloc(%74) : (i64) -> !llvm.ptr
+    %76 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %77 = llvm.insertvalue %75, %76[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %78 = llvm.insertvalue %75, %77[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %79 = llvm.mlir.constant(0 : index) : i64
+    %80 = llvm.insertvalue %79, %78[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %81 = llvm.insertvalue %70, %80[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %82 = llvm.insertvalue %71, %81[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %83 = builtin.unrealized_conversion_cast %82 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<64xi32>
+    %84 = llvm.mlir.constant(0 : i32) : i32
+    linalg.fill ins(%84 : i32) outs(%83 : memref<64xi32>)
+    %85 = llvm.mlir.constant(0 : index) : i64
+    %86 = llvm.mlir.constant(1 : index) : i64
+    %87 = llvm.mlir.constant(3 : index) : i64
+    llvm.br ^bb1(%85 : i64)
+  ^bb1(%88: i64):  // 2 preds: ^bb0, ^bb11
+    %89 = llvm.icmp "slt" %88, %87 : i64
+    llvm.cond_br %89, ^bb2, ^bb12
+  ^bb2:  // pred: ^bb1
+    %90 = llvm.mlir.constant(3 : index) : i64
+    llvm.br ^bb3(%85 : i64)
+  ^bb3(%91: i64):  // 2 preds: ^bb2, ^bb10
+    %92 = llvm.icmp "slt" %91, %90 : i64
+    llvm.cond_br %92, ^bb4, ^bb11
+  ^bb4:  // pred: ^bb3
+    %93 = llvm.mlir.constant(32 : index) : i64
+    llvm.br ^bb5(%85 : i64)
+  ^bb5(%94: i64):  // 2 preds: ^bb4, ^bb9
+    %95 = llvm.icmp "slt" %94, %93 : i64
+    llvm.cond_br %95, ^bb6, ^bb10
+  ^bb6:  // pred: ^bb5
+    %96 = llvm.mlir.constant(64 : index) : i64
+    llvm.br ^bb7(%85 : i64)
+  ^bb7(%97: i64):  // 2 preds: ^bb6, ^bb8
+    %98 = llvm.icmp "slt" %97, %96 : i64
+    llvm.cond_br %98, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %99 = llvm.mlir.constant(3 : index) : i64
+    %100 = llvm.mlir.constant(32 : index) : i64
+    %101 = llvm.mul %88, %99 : i64
+    %102 = llvm.mul %101, %100 : i64
+    %103 = llvm.mul %91, %100 : i64
+    %104 = llvm.add %102, %103 : i64
+    %105 = llvm.add %104, %94 : i64
+    %106 = llvm.extractvalue %35[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %107 = llvm.mlir.constant(6144 : index) : i64
+    %108 = llvm.mul %88, %107 : i64
+    %109 = llvm.mlir.constant(2048 : index) : i64
+    %110 = llvm.mul %91, %109 : i64
+    %111 = llvm.add %108, %110 : i64
+    %112 = llvm.mlir.constant(64 : index) : i64
+    %113 = llvm.mul %94, %112 : i64
+    %114 = llvm.add %111, %113 : i64
+    %115 = llvm.add %114, %97 : i64
+    %116 = llvm.getelementptr %106[%115] : (!llvm.ptr, i64) -> !llvm.ptr, f16
+    %117 = llvm.load %116 : !llvm.ptr -> f16
+    %118 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %119 = llvm.mlir.constant(64 : index) : i64
+    %120 = llvm.mul %105, %119 : i64
+    %121 = llvm.add %120, %97 : i64
+    %122 = llvm.getelementptr %118[%121] : (!llvm.ptr, i64) -> !llvm.ptr, f16
+    llvm.store %117, %122 : f16, !llvm.ptr
+    %123 = llvm.add %97, %86 : i64
+    llvm.br ^bb7(%123 : i64)
+  ^bb9:  // pred: ^bb7
+    %124 = llvm.add %94, %86 : i64
+    llvm.br ^bb5(%124 : i64)
+  ^bb10:  // pred: ^bb5
+    %125 = llvm.add %91, %86 : i64
+    llvm.br ^bb3(%125 : i64)
+  ^bb11:  // pred: ^bb3
+    %126 = llvm.add %88, %86 : i64
+    llvm.br ^bb1(%126 : i64)
+  ^bb12:  // pred: ^bb1
+    %127 = llvm.mlir.constant(30 : i64) : i64
+    %128 = llvm.mlir.constant(3 : i64) : i64
+    %129 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %130 = llvm.ptrtoint %129 : !llvm.ptr to i64
+    %131 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %132 = llvm.ptrtoint %131 : !llvm.ptr to i64
+    %133 = llvm.extractvalue %82[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %134 = llvm.ptrtoint %133 : !llvm.ptr to i64
+    %135 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %136 = llvm.ptrtoint %135 : !llvm.ptr to i64
+    %137 = llvm.mlir.constant(64 : i64) : i64
+    %138 = llvm.mlir.constant(2 : i64) : i64
+    %139 = llvm.mlir.constant(4575657221408424000 : i64) : i64
+    "gemmini.intr.config_st"(%138, %139) : (i64, i64) -> ()
+    %140 = llvm.mlir.constant(65540 : i64) : i64
+    %141 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%140, %141) : (i64, i64) -> ()
+    %142 = llvm.mlir.constant(0 : i64) : i64
+    %143 = llvm.mlir.constant(0 : i64) : i64
+    %144 = llvm.mlir.constant(0 : i64) : i64
+    %145 = llvm.mlir.constant(0 : i64) : i64
+    %146 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %147 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%146, %147) : (i64, i64) -> ()
+    %148 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %149 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%148, %149) : (i64, i64) -> ()
+    %150 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %151 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%150, %151) : (i64, i64) -> ()
+    %152 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %153 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%152, %153) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %132) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %130) : (i64, i64) -> ()
+    %154 = llvm.mlir.constant(256 : i64) : i64
+    %155 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%154, %155) : (i64, i64) -> ()
+    %156 = llvm.mlir.constant(16 : i64) : i64
+    %157 = llvm.add %132, %156 : i64
+    %158 = llvm.mlir.constant(64 : i64) : i64
+    %159 = llvm.add %134, %158 : i64
+    %160 = llvm.mlir.constant(16 : i64) : i64
+    %161 = llvm.add %136, %160 : i64
+    %162 = llvm.mlir.constant(0 : i64) : i64
+    %163 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %164 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%163, %164) : (i64, i64) -> ()
+    %165 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %166 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%165, %166) : (i64, i64) -> ()
+    %167 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %168 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%167, %168) : (i64, i64) -> ()
+    %169 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %170 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%169, %170) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%161, %157) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%159, %130) : (i64, i64) -> ()
+    %171 = llvm.mlir.constant(256 : i64) : i64
+    %172 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%171, %172) : (i64, i64) -> ()
+    %173 = llvm.mlir.constant(32 : i64) : i64
+    %174 = llvm.add %132, %173 : i64
+    %175 = llvm.mlir.constant(128 : i64) : i64
+    %176 = llvm.add %134, %175 : i64
+    %177 = llvm.mlir.constant(32 : i64) : i64
+    %178 = llvm.add %136, %177 : i64
+    %179 = llvm.mlir.constant(0 : i64) : i64
+    %180 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %181 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%180, %181) : (i64, i64) -> ()
+    %182 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %183 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%182, %183) : (i64, i64) -> ()
+    %184 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %185 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%184, %185) : (i64, i64) -> ()
+    %186 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %187 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%186, %187) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%178, %174) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%176, %130) : (i64, i64) -> ()
+    %188 = llvm.mlir.constant(256 : i64) : i64
+    %189 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%188, %189) : (i64, i64) -> ()
+    %190 = llvm.mlir.constant(48 : i64) : i64
+    %191 = llvm.add %132, %190 : i64
+    %192 = llvm.mlir.constant(192 : i64) : i64
+    %193 = llvm.add %134, %192 : i64
+    %194 = llvm.mlir.constant(48 : i64) : i64
+    %195 = llvm.add %136, %194 : i64
+    %196 = llvm.mlir.constant(0 : i64) : i64
+    %197 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %198 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%197, %198) : (i64, i64) -> ()
+    %199 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %200 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%199, %200) : (i64, i64) -> ()
+    %201 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %202 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%201, %202) : (i64, i64) -> ()
+    %203 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %204 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%203, %204) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%195, %191) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%193, %130) : (i64, i64) -> ()
+    %205 = llvm.mlir.constant(256 : i64) : i64
+    %206 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%205, %206) : (i64, i64) -> ()
+    %207 = llvm.mlir.constant(1472 : i64) : i64
+    %208 = llvm.add %132, %207 : i64
+    %209 = llvm.mlir.constant(0 : i64) : i64
+    %210 = llvm.mlir.constant(0 : i64) : i64
+    %211 = llvm.mlir.constant(736 : i64) : i64
+    %212 = llvm.add %130, %211 : i64
+    %213 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %214 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%213, %214) : (i64, i64) -> ()
+    %215 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %216 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%215, %216) : (i64, i64) -> ()
+    %217 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %218 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%217, %218) : (i64, i64) -> ()
+    %219 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %220 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%219, %220) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %208) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %212) : (i64, i64) -> ()
+    %221 = llvm.mlir.constant(256 : i64) : i64
+    %222 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%221, %222) : (i64, i64) -> ()
+    %223 = llvm.mlir.constant(1488 : i64) : i64
+    %224 = llvm.add %132, %223 : i64
+    %225 = llvm.mlir.constant(64 : i64) : i64
+    %226 = llvm.add %134, %225 : i64
+    %227 = llvm.mlir.constant(16 : i64) : i64
+    %228 = llvm.add %136, %227 : i64
+    %229 = llvm.mlir.constant(736 : i64) : i64
+    %230 = llvm.add %130, %229 : i64
+    %231 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %232 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%231, %232) : (i64, i64) -> ()
+    %233 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %234 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%233, %234) : (i64, i64) -> ()
+    %235 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %236 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%235, %236) : (i64, i64) -> ()
+    %237 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %238 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%237, %238) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%228, %224) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%226, %230) : (i64, i64) -> ()
+    %239 = llvm.mlir.constant(256 : i64) : i64
+    %240 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%239, %240) : (i64, i64) -> ()
+    %241 = llvm.mlir.constant(1504 : i64) : i64
+    %242 = llvm.add %132, %241 : i64
+    %243 = llvm.mlir.constant(128 : i64) : i64
+    %244 = llvm.add %134, %243 : i64
+    %245 = llvm.mlir.constant(32 : i64) : i64
+    %246 = llvm.add %136, %245 : i64
+    %247 = llvm.mlir.constant(736 : i64) : i64
+    %248 = llvm.add %130, %247 : i64
+    %249 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %250 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%249, %250) : (i64, i64) -> ()
+    %251 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %252 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%251, %252) : (i64, i64) -> ()
+    %253 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %254 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%253, %254) : (i64, i64) -> ()
+    %255 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %256 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%255, %256) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%246, %242) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%244, %248) : (i64, i64) -> ()
+    %257 = llvm.mlir.constant(256 : i64) : i64
+    %258 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%257, %258) : (i64, i64) -> ()
+    %259 = llvm.mlir.constant(1520 : i64) : i64
+    %260 = llvm.add %132, %259 : i64
+    %261 = llvm.mlir.constant(192 : i64) : i64
+    %262 = llvm.add %134, %261 : i64
+    %263 = llvm.mlir.constant(48 : i64) : i64
+    %264 = llvm.add %136, %263 : i64
+    %265 = llvm.mlir.constant(736 : i64) : i64
+    %266 = llvm.add %130, %265 : i64
+    %267 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %268 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%267, %268) : (i64, i64) -> ()
+    %269 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %270 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%269, %270) : (i64, i64) -> ()
+    %271 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %272 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%271, %272) : (i64, i64) -> ()
+    %273 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %274 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%273, %274) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%264, %260) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%262, %266) : (i64, i64) -> ()
+    %275 = llvm.mlir.constant(256 : i64) : i64
+    %276 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%275, %276) : (i64, i64) -> ()
+    %277 = llvm.mlir.constant(42240 : i64) : i64
+    %278 = llvm.add %132, %277 : i64
+    %279 = llvm.mlir.constant(0 : i64) : i64
+    %280 = llvm.mlir.constant(0 : i64) : i64
+    %281 = llvm.mlir.constant(22528 : i64) : i64
+    %282 = llvm.add %130, %281 : i64
+    %283 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %284 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%283, %284) : (i64, i64) -> ()
+    %285 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %286 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%285, %286) : (i64, i64) -> ()
+    %287 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %288 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%287, %288) : (i64, i64) -> ()
+    %289 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %290 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%289, %290) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %278) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %282) : (i64, i64) -> ()
+    %291 = llvm.mlir.constant(256 : i64) : i64
+    %292 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%291, %292) : (i64, i64) -> ()
+    %293 = llvm.mlir.constant(42256 : i64) : i64
+    %294 = llvm.add %132, %293 : i64
+    %295 = llvm.mlir.constant(64 : i64) : i64
+    %296 = llvm.add %134, %295 : i64
+    %297 = llvm.mlir.constant(16 : i64) : i64
+    %298 = llvm.add %136, %297 : i64
+    %299 = llvm.mlir.constant(22528 : i64) : i64
+    %300 = llvm.add %130, %299 : i64
+    %301 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %302 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%301, %302) : (i64, i64) -> ()
+    %303 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %304 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%303, %304) : (i64, i64) -> ()
+    %305 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %306 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%305, %306) : (i64, i64) -> ()
+    %307 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %308 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%307, %308) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%298, %294) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%296, %300) : (i64, i64) -> ()
+    %309 = llvm.mlir.constant(256 : i64) : i64
+    %310 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%309, %310) : (i64, i64) -> ()
+    %311 = llvm.mlir.constant(42272 : i64) : i64
+    %312 = llvm.add %132, %311 : i64
+    %313 = llvm.mlir.constant(128 : i64) : i64
+    %314 = llvm.add %134, %313 : i64
+    %315 = llvm.mlir.constant(32 : i64) : i64
+    %316 = llvm.add %136, %315 : i64
+    %317 = llvm.mlir.constant(22528 : i64) : i64
+    %318 = llvm.add %130, %317 : i64
+    %319 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %320 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%319, %320) : (i64, i64) -> ()
+    %321 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %322 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%321, %322) : (i64, i64) -> ()
+    %323 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %324 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%323, %324) : (i64, i64) -> ()
+    %325 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %326 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%325, %326) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%316, %312) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%314, %318) : (i64, i64) -> ()
+    %327 = llvm.mlir.constant(256 : i64) : i64
+    %328 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%327, %328) : (i64, i64) -> ()
+    %329 = llvm.mlir.constant(42288 : i64) : i64
+    %330 = llvm.add %132, %329 : i64
+    %331 = llvm.mlir.constant(192 : i64) : i64
+    %332 = llvm.add %134, %331 : i64
+    %333 = llvm.mlir.constant(48 : i64) : i64
+    %334 = llvm.add %136, %333 : i64
+    %335 = llvm.mlir.constant(22528 : i64) : i64
+    %336 = llvm.add %130, %335 : i64
+    %337 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %338 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%337, %338) : (i64, i64) -> ()
+    %339 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %340 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%339, %340) : (i64, i64) -> ()
+    %341 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %342 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%341, %342) : (i64, i64) -> ()
+    %343 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %344 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%343, %344) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%334, %330) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%332, %336) : (i64, i64) -> ()
+    %345 = llvm.mlir.constant(256 : i64) : i64
+    %346 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%345, %346) : (i64, i64) -> ()
+    %347 = llvm.mlir.constant(43712 : i64) : i64
+    %348 = llvm.add %132, %347 : i64
+    %349 = llvm.mlir.constant(0 : i64) : i64
+    %350 = llvm.mlir.constant(0 : i64) : i64
+    %351 = llvm.mlir.constant(23264 : i64) : i64
+    %352 = llvm.add %130, %351 : i64
+    %353 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %354 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%353, %354) : (i64, i64) -> ()
+    %355 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %356 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%355, %356) : (i64, i64) -> ()
+    %357 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %358 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%357, %358) : (i64, i64) -> ()
+    %359 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %360 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%359, %360) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %348) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %352) : (i64, i64) -> ()
+    %361 = llvm.mlir.constant(256 : i64) : i64
+    %362 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%361, %362) : (i64, i64) -> ()
+    %363 = llvm.mlir.constant(43728 : i64) : i64
+    %364 = llvm.add %132, %363 : i64
+    %365 = llvm.mlir.constant(64 : i64) : i64
+    %366 = llvm.add %134, %365 : i64
+    %367 = llvm.mlir.constant(16 : i64) : i64
+    %368 = llvm.add %136, %367 : i64
+    %369 = llvm.mlir.constant(23264 : i64) : i64
+    %370 = llvm.add %130, %369 : i64
+    %371 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %372 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%371, %372) : (i64, i64) -> ()
+    %373 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %374 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%373, %374) : (i64, i64) -> ()
+    %375 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %376 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%375, %376) : (i64, i64) -> ()
+    %377 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %378 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%377, %378) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%368, %364) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%366, %370) : (i64, i64) -> ()
+    %379 = llvm.mlir.constant(256 : i64) : i64
+    %380 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%379, %380) : (i64, i64) -> ()
+    %381 = llvm.mlir.constant(43744 : i64) : i64
+    %382 = llvm.add %132, %381 : i64
+    %383 = llvm.mlir.constant(128 : i64) : i64
+    %384 = llvm.add %134, %383 : i64
+    %385 = llvm.mlir.constant(32 : i64) : i64
+    %386 = llvm.add %136, %385 : i64
+    %387 = llvm.mlir.constant(23264 : i64) : i64
+    %388 = llvm.add %130, %387 : i64
+    %389 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %390 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%389, %390) : (i64, i64) -> ()
+    %391 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %392 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%391, %392) : (i64, i64) -> ()
+    %393 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %394 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%393, %394) : (i64, i64) -> ()
+    %395 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %396 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%395, %396) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%386, %382) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%384, %388) : (i64, i64) -> ()
+    %397 = llvm.mlir.constant(256 : i64) : i64
+    %398 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%397, %398) : (i64, i64) -> ()
+    %399 = llvm.mlir.constant(43760 : i64) : i64
+    %400 = llvm.add %132, %399 : i64
+    %401 = llvm.mlir.constant(192 : i64) : i64
+    %402 = llvm.add %134, %401 : i64
+    %403 = llvm.mlir.constant(48 : i64) : i64
+    %404 = llvm.add %136, %403 : i64
+    %405 = llvm.mlir.constant(23264 : i64) : i64
+    %406 = llvm.add %130, %405 : i64
+    %407 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %408 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%407, %408) : (i64, i64) -> ()
+    %409 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %410 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%409, %410) : (i64, i64) -> ()
+    %411 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %412 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%411, %412) : (i64, i64) -> ()
+    %413 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %414 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%413, %414) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%404, %400) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%402, %406) : (i64, i64) -> ()
+    %415 = llvm.mlir.constant(256 : i64) : i64
+    %416 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%415, %416) : (i64, i64) -> ()
+    %417 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%417, %417) : (i64, i64) -> ()
+    %418 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%85 : i64)
+  ^bb13(%419: i64):  // 2 preds: ^bb12, ^bb23
+    %420 = llvm.icmp "slt" %419, %418 : i64
+    llvm.cond_br %420, ^bb14, ^bb24
+  ^bb14:  // pred: ^bb13
+    %421 = llvm.mlir.constant(30 : index) : i64
+    llvm.br ^bb15(%85 : i64)
+  ^bb15(%422: i64):  // 2 preds: ^bb14, ^bb22
+    %423 = llvm.icmp "slt" %422, %421 : i64
+    llvm.cond_br %423, ^bb16, ^bb23
+  ^bb16:  // pred: ^bb15
+    %424 = llvm.mlir.constant(30 : index) : i64
+    llvm.br ^bb17(%85 : i64)
+  ^bb17(%425: i64):  // 2 preds: ^bb16, ^bb21
+    %426 = llvm.icmp "slt" %425, %424 : i64
+    llvm.cond_br %426, ^bb18, ^bb22
+  ^bb18:  // pred: ^bb17
+    %427 = llvm.mlir.constant(64 : index) : i64
+    llvm.br ^bb19(%85 : i64)
+  ^bb19(%428: i64):  // 2 preds: ^bb18, ^bb20
+    %429 = llvm.icmp "slt" %428, %427 : i64
+    llvm.cond_br %429, ^bb20, ^bb21
+  ^bb20:  // pred: ^bb19
+    %430 = llvm.mlir.constant(30 : index) : i64
+    %431 = llvm.mul %419, %430 : i64
+    %432 = llvm.mul %431, %430 : i64
+    %433 = llvm.mul %422, %430 : i64
+    %434 = llvm.add %432, %433 : i64
+    %435 = llvm.add %434, %425 : i64
+    %436 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %437 = llvm.mlir.constant(64 : index) : i64
+    %438 = llvm.mul %435, %437 : i64
+    %439 = llvm.add %438, %428 : i64
+    %440 = llvm.getelementptr %436[%439] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %441 = llvm.load %440 : !llvm.ptr -> f32
+    %442 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %443 = llvm.mlir.constant(57600 : index) : i64
+    %444 = llvm.mul %419, %443 : i64
+    %445 = llvm.mlir.constant(1920 : index) : i64
+    %446 = llvm.mul %422, %445 : i64
+    %447 = llvm.add %444, %446 : i64
+    %448 = llvm.mlir.constant(64 : index) : i64
+    %449 = llvm.mul %425, %448 : i64
+    %450 = llvm.add %447, %449 : i64
+    %451 = llvm.add %450, %428 : i64
+    %452 = llvm.getelementptr %442[%451] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %441, %452 : f32, !llvm.ptr
+    %453 = llvm.add %428, %86 : i64
+    llvm.br ^bb19(%453 : i64)
+  ^bb21:  // pred: ^bb19
+    %454 = llvm.add %425, %86 : i64
+    llvm.br ^bb17(%454 : i64)
+  ^bb22:  // pred: ^bb17
+    %455 = llvm.add %422, %86 : i64
+    llvm.br ^bb15(%455 : i64)
+  ^bb23:  // pred: ^bb15
+    %456 = llvm.add %419, %86 : i64
+    llvm.br ^bb13(%456 : i64)
+  ^bb24:  // pred: ^bb13
+    %457 = llvm.extractvalue %52[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%457) : (!llvm.ptr) -> ()
+    %458 = llvm.extractvalue %69[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%458) : (!llvm.ptr) -> ()
+    %459 = llvm.extractvalue %82[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    llvm.call @free(%459) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+}
+
+
diff --git a/experiments/gemmini/networks/conv2d_block1.mlir b/experiments/gemmini/networks/conv2d_block1.mlir
new file mode 100644
index 0000000..32a6ba7
--- /dev/null
+++ b/experiments/gemmini/networks/conv2d_block1.mlir
@@ -0,0 +1,13 @@
+module {
+  func.func @conv2d_block1(
+      %input: memref<1x32x32x32xf16>,   // [N,H,W,C_in] = 1x32x32x32
+      %filter: memref<3x3x32x64xf16>,   // [KH,KW,C_in,C_out] = 3x3x32x64
+      %output: memref<1x30x30x64xf32>   // [N,H_out,W_out,C_out] = 1x30x30x64; f32 while both inputs are f16
+  ) {
+    // NHWC input x HWCF filter conv with no strides/dilations attributes; 30x30 output is consistent with unit stride and no padding (32 - 3 + 1 = 30).
+    linalg.conv_2d_nhwc_hwcf
+      ins(%input, %filter : memref<1x32x32x32xf16>, memref<3x3x32x64xf16>)
+      outs(%output : memref<1x30x30x64xf32>)
+    return
+  }
+}

From 860d350b12c0fba77b8a262d886ef975f3ad1251 Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Thu, 18 Dec 2025 01:11:07 -0800
Subject: [PATCH 07/13] Buddy Gemmini: add NCHW conv2d block lowering test

---
 .../conv2d_block_nchw.print-after-all.mlir    | 835 ++++++++++++++++++
 .../gemmini/networks/conv2d_block_nchw.mlir   |  17 +
 2 files changed, 852 insertions(+)
 create mode 100644 experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir
 create mode 100644 experiments/gemmini/networks/conv2d_block_nchw.mlir

diff --git a/experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir b/experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir
new file mode 100644
index 0000000..fba1d9e
--- /dev/null
+++ b/experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir
@@ -0,0 +1,835 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {  // NOTE(review): trailing comments in this dump are reviewer annotations, not pass output
+  func.func @conv2d_block_nchw(%arg0: memref<1x3x32x32xf32>, %arg1: memref<64x3x3x3xf32>, %arg2: memref<1x64x30x30xf32>) {  // presumably NCHW input, FCHW filter, NCHW output (per the function name) - confirm against the network file
+    %alloc = memref.alloc() : memref<1x32x32x3xf32>  // staging copy of %arg0 with its dim 1 moved to the last position
+    %alloc_0 = memref.alloc() : memref<27x64xf32>  // staging copy of %arg1 flattened to 27x64 (27 = 3*3*3)
+    %alloc_1 = memref.alloc() : memref<64xi32>  // NOTE(review): never written in this function before being passed to gemmini.tile_conv; the matmul dump zero-fills its i32 buffer with linalg.fill - confirm tile_conv does not read uninitialized data
+    %alloc_2 = memref.alloc() : memref<900x64xf32>  // 4th tile_conv operand; read back below with row index a*900 + c*30 + d (900 = 30*30)
+    %c30_i64 = arith.constant 30 : i64
+    %c3 = arith.constant 3 : index
+    %c3_3 = arith.constant 3 : index
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c1_4 = arith.constant 1 : index
+    scf.for %arg3 = %c0 to %c1 step %c1_4 {  // loop nest 1: %alloc[a, c, d, b] = %arg0[a, b, c, d] over 1x3x32x32
+      %c0_10 = arith.constant 0 : index
+      %c3_11 = arith.constant 3 : index
+      %c1_12 = arith.constant 1 : index
+      scf.for %arg4 = %c0_10 to %c3_11 step %c1_12 {
+        %c0_13 = arith.constant 0 : index
+        %c32 = arith.constant 32 : index
+        %c1_14 = arith.constant 1 : index
+        scf.for %arg5 = %c0_13 to %c32 step %c1_14 {
+          %c0_15 = arith.constant 0 : index
+          %c32_16 = arith.constant 32 : index
+          %c1_17 = arith.constant 1 : index
+          scf.for %arg6 = %c0_15 to %c32_16 step %c1_17 {
+            %0 = memref.load %arg0[%arg3, %arg4, %arg5, %arg6] : memref<1x3x32x32xf32>
+            memref.store %0, %alloc[%arg3, %arg5, %arg6, %arg4] : memref<1x32x32x3xf32>
+          }
+        }
+      }
+    }
+    %c0_5 = arith.constant 0 : index
+    %c64 = arith.constant 64 : index
+    %c1_6 = arith.constant 1 : index
+    scf.for %arg3 = %c0_5 to %c64 step %c1_6 {  // loop nest 2: %alloc_0[c*9 + d*3 + b, a] = %arg1[a, b, c, d] over 64x3x3x3
+      %c0_10 = arith.constant 0 : index
+      %c3_11 = arith.constant 3 : index
+      %c1_12 = arith.constant 1 : index
+      scf.for %arg4 = %c0_10 to %c3_11 step %c1_12 {
+        %c0_13 = arith.constant 0 : index
+        %c3_14 = arith.constant 3 : index
+        %c1_15 = arith.constant 1 : index
+        scf.for %arg5 = %c0_13 to %c3_14 step %c1_15 {
+          %c0_16 = arith.constant 0 : index
+          %c3_17 = arith.constant 3 : index
+          %c1_18 = arith.constant 1 : index
+          scf.for %arg6 = %c0_16 to %c3_17 step %c1_18 {
+            %0 = arith.muli %arg5, %c3 : index
+            %1 = arith.muli %0, %c3_3 : index
+            %2 = arith.muli %arg6, %c3_3 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg4 : index
+            %5 = memref.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<64x3x3x3xf32>
+            memref.store %5, %alloc_0[%4, %arg3] : memref<27x64xf32>
+          }
+        }
+      }
+    }
+    %c3_i64 = arith.constant 3 : i64
+    gemmini.tile_conv %alloc %alloc_0 %alloc_1 %alloc_2 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x64xf32> memref<64xi32> memref<900x64xf32> i64 i64 i64
+    %c0_7 = arith.constant 0 : index
+    %c1_8 = arith.constant 1 : index
+    %c1_9 = arith.constant 1 : index
+    scf.for %arg3 = %c0_7 to %c1_8 step %c1_9 {  // loop nest 3: %arg2[a, b, c, d] = %alloc_2[a*900 + c*30 + d, b] over 1x64x30x30
+      %c0_10 = arith.constant 0 : index
+      %c64_11 = arith.constant 64 : index
+      %c1_12 = arith.constant 1 : index
+      scf.for %arg4 = %c0_10 to %c64_11 step %c1_12 {
+        %c0_13 = arith.constant 0 : index
+        %c30 = arith.constant 30 : index
+        %c1_14 = arith.constant 1 : index
+        scf.for %arg5 = %c0_13 to %c30 step %c1_14 {
+          %c0_15 = arith.constant 0 : index
+          %c30_16 = arith.constant 30 : index
+          %c1_17 = arith.constant 1 : index
+          scf.for %arg6 = %c0_15 to %c30_16 step %c1_17 {
+            %c30_18 = arith.constant 30 : index
+            %0 = arith.muli %arg3, %c30_18 : index
+            %1 = arith.muli %0, %c30_18 : index
+            %2 = arith.muli %arg5, %c30_18 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg6 : index
+            %5 = memref.load %alloc_2[%4, %arg4] : memref<900x64xf32>
+            memref.store %5, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x64x30x30xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc : memref<1x32x32x3xf32>
+    memref.dealloc %alloc_0 : memref<27x64xf32>
+    memref.dealloc %alloc_2 : memref<900x64xf32>
+    memref.dealloc %alloc_1 : memref<64xi32>
+    return
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @conv2d_block_nchw(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %1 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %2 = llvm.insertvalue %arg23, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %3 = llvm.insertvalue %arg24, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %4 = llvm.insertvalue %arg25, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %5 = llvm.insertvalue %arg29, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %6 = llvm.insertvalue %arg26, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %7 = llvm.insertvalue %arg30, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %8 = llvm.insertvalue %arg27, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %9 = llvm.insertvalue %arg31, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %10 = llvm.insertvalue %arg28, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %11 = llvm.insertvalue %arg32, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %13 = llvm.insertvalue %arg11, %12[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %14 = llvm.insertvalue %arg12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %15 = llvm.insertvalue %arg13, %14[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %16 = llvm.insertvalue %arg14, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %17 = llvm.insertvalue %arg18, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %18 = llvm.insertvalue %arg15, %17[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %19 = llvm.insertvalue %arg19, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %20 = llvm.insertvalue %arg16, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %21 = llvm.insertvalue %arg20, %20[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %22 = llvm.insertvalue %arg17, %21[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %23 = llvm.insertvalue %arg21, %22[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %25 = llvm.insertvalue %arg0, %24[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %26 = llvm.insertvalue %arg1, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %27 = llvm.insertvalue %arg2, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %28 = llvm.insertvalue %arg3, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %29 = llvm.insertvalue %arg7, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %30 = llvm.insertvalue %arg4, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %31 = llvm.insertvalue %arg8, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %32 = llvm.insertvalue %arg5, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %33 = llvm.insertvalue %arg9, %32[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %34 = llvm.insertvalue %arg6, %33[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %35 = llvm.insertvalue %arg10, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %36 = llvm.mlir.constant(1 : index) : i64
+    %37 = llvm.mlir.constant(32 : index) : i64
+    %38 = llvm.mlir.constant(32 : index) : i64
+    %39 = llvm.mlir.constant(3 : index) : i64
+    %40 = llvm.mlir.constant(1 : index) : i64
+    %41 = llvm.mlir.constant(96 : index) : i64
+    %42 = llvm.mlir.constant(3072 : index) : i64
+    %43 = llvm.mlir.constant(3072 : index) : i64
+    %44 = llvm.mlir.zero : !llvm.ptr
+    %45 = llvm.getelementptr %44[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %46 = llvm.ptrtoint %45 : !llvm.ptr to i64
+    %47 = llvm.call @malloc(%46) : (i64) -> !llvm.ptr
+    %48 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %49 = llvm.insertvalue %47, %48[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %50 = llvm.insertvalue %47, %49[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %51 = llvm.mlir.constant(0 : index) : i64
+    %52 = llvm.insertvalue %51, %50[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %53 = llvm.insertvalue %36, %52[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %54 = llvm.insertvalue %37, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %55 = llvm.insertvalue %38, %54[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %56 = llvm.insertvalue %39, %55[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %57 = llvm.insertvalue %42, %56[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %58 = llvm.insertvalue %41, %57[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %59 = llvm.insertvalue %39, %58[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %60 = llvm.insertvalue %40, %59[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %61 = llvm.mlir.constant(27 : index) : i64
+    %62 = llvm.mlir.constant(64 : index) : i64
+    %63 = llvm.mlir.constant(1 : index) : i64
+    %64 = llvm.mlir.constant(1728 : index) : i64
+    %65 = llvm.mlir.zero : !llvm.ptr
+    %66 = llvm.getelementptr %65[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %67 = llvm.ptrtoint %66 : !llvm.ptr to i64
+    %68 = llvm.call @malloc(%67) : (i64) -> !llvm.ptr
+    %69 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %70 = llvm.insertvalue %68, %69[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %71 = llvm.insertvalue %68, %70[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %72 = llvm.mlir.constant(0 : index) : i64
+    %73 = llvm.insertvalue %72, %71[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %74 = llvm.insertvalue %61, %73[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %75 = llvm.insertvalue %62, %74[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %76 = llvm.insertvalue %62, %75[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %77 = llvm.insertvalue %63, %76[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %78 = llvm.mlir.constant(64 : index) : i64
+    %79 = llvm.mlir.constant(1 : index) : i64
+    %80 = llvm.mlir.zero : !llvm.ptr
+    %81 = llvm.getelementptr %80[%78] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %82 = llvm.ptrtoint %81 : !llvm.ptr to i64
+    %83 = llvm.call @malloc(%82) : (i64) -> !llvm.ptr
+    %84 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %85 = llvm.insertvalue %83, %84[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %86 = llvm.insertvalue %83, %85[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %87 = llvm.mlir.constant(0 : index) : i64
+    %88 = llvm.insertvalue %87, %86[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %89 = llvm.insertvalue %78, %88[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %90 = llvm.insertvalue %79, %89[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %91 = llvm.mlir.constant(900 : index) : i64
+    %92 = llvm.mlir.constant(64 : index) : i64
+    %93 = llvm.mlir.constant(1 : index) : i64
+    %94 = llvm.mlir.constant(57600 : index) : i64
+    %95 = llvm.mlir.zero : !llvm.ptr
+    %96 = llvm.getelementptr %95[%94] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %97 = llvm.ptrtoint %96 : !llvm.ptr to i64
+    %98 = llvm.call @malloc(%97) : (i64) -> !llvm.ptr
+    %99 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %100 = llvm.insertvalue %98, %99[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %101 = llvm.insertvalue %98, %100[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %102 = llvm.mlir.constant(0 : index) : i64
+    %103 = llvm.insertvalue %102, %101[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %104 = llvm.insertvalue %91, %103[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %105 = llvm.insertvalue %92, %104[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %106 = llvm.insertvalue %92, %105[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %107 = llvm.insertvalue %93, %106[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %108 = llvm.mlir.constant(30 : i64) : i64
+    %109 = llvm.mlir.constant(3 : index) : i64
+    %110 = llvm.mlir.constant(3 : index) : i64
+    %111 = llvm.mlir.constant(0 : index) : i64
+    %112 = llvm.mlir.constant(1 : index) : i64
+    %113 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb1(%111 : i64)
+  ^bb1(%114: i64):  // 2 preds: ^bb0, ^bb11
+    %115 = llvm.icmp "slt" %114, %112 : i64
+    llvm.cond_br %115, ^bb2, ^bb12
+  ^bb2:  // pred: ^bb1
+    %116 = llvm.mlir.constant(0 : index) : i64
+    %117 = llvm.mlir.constant(3 : index) : i64
+    %118 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb3(%116 : i64)
+  ^bb3(%119: i64):  // 2 preds: ^bb2, ^bb10
+    %120 = llvm.icmp "slt" %119, %117 : i64
+    llvm.cond_br %120, ^bb4, ^bb11
+  ^bb4:  // pred: ^bb3
+    %121 = llvm.mlir.constant(0 : index) : i64
+    %122 = llvm.mlir.constant(32 : index) : i64
+    %123 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb5(%121 : i64)
+  ^bb5(%124: i64):  // 2 preds: ^bb4, ^bb9
+    %125 = llvm.icmp "slt" %124, %122 : i64
+    llvm.cond_br %125, ^bb6, ^bb10
+  ^bb6:  // pred: ^bb5
+    %126 = llvm.mlir.constant(0 : index) : i64
+    %127 = llvm.mlir.constant(32 : index) : i64
+    %128 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb7(%126 : i64)
+  ^bb7(%129: i64):  // 2 preds: ^bb6, ^bb8
+    %130 = llvm.icmp "slt" %129, %127 : i64
+    llvm.cond_br %130, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %131 = llvm.extractvalue %35[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %132 = llvm.mlir.constant(3072 : index) : i64
+    %133 = llvm.mul %114, %132 : i64
+    %134 = llvm.mlir.constant(1024 : index) : i64
+    %135 = llvm.mul %119, %134 : i64
+    %136 = llvm.add %133, %135 : i64
+    %137 = llvm.mlir.constant(32 : index) : i64
+    %138 = llvm.mul %124, %137 : i64
+    %139 = llvm.add %136, %138 : i64
+    %140 = llvm.add %139, %129 : i64
+    %141 = llvm.getelementptr %131[%140] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %142 = llvm.load %141 : !llvm.ptr -> f32
+    %143 = llvm.extractvalue %60[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %144 = llvm.mlir.constant(3072 : index) : i64
+    %145 = llvm.mul %114, %144 : i64
+    %146 = llvm.mlir.constant(96 : index) : i64
+    %147 = llvm.mul %124, %146 : i64
+    %148 = llvm.add %145, %147 : i64
+    %149 = llvm.mlir.constant(3 : index) : i64
+    %150 = llvm.mul %129, %149 : i64
+    %151 = llvm.add %148, %150 : i64
+    %152 = llvm.add %151, %119 : i64
+    %153 = llvm.getelementptr %143[%152] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %142, %153 : f32, !llvm.ptr
+    %154 = llvm.add %129, %128 : i64
+    llvm.br ^bb7(%154 : i64)
+  ^bb9:  // pred: ^bb7
+    %155 = llvm.add %124, %123 : i64
+    llvm.br ^bb5(%155 : i64)
+  ^bb10:  // pred: ^bb5
+    %156 = llvm.add %119, %118 : i64
+    llvm.br ^bb3(%156 : i64)
+  ^bb11:  // pred: ^bb3
+    %157 = llvm.add %114, %113 : i64
+    llvm.br ^bb1(%157 : i64)
+  ^bb12:  // pred: ^bb1
+    %158 = llvm.mlir.constant(0 : index) : i64
+    %159 = llvm.mlir.constant(64 : index) : i64
+    %160 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%158 : i64)
+  ^bb13(%161: i64):  // 2 preds: ^bb12, ^bb23
+    %162 = llvm.icmp "slt" %161, %159 : i64
+    llvm.cond_br %162, ^bb14, ^bb24
+  ^bb14:  // pred: ^bb13
+    %163 = llvm.mlir.constant(0 : index) : i64
+    %164 = llvm.mlir.constant(3 : index) : i64
+    %165 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb15(%163 : i64)
+  ^bb15(%166: i64):  // 2 preds: ^bb14, ^bb22
+    %167 = llvm.icmp "slt" %166, %164 : i64
+    llvm.cond_br %167, ^bb16, ^bb23
+  ^bb16:  // pred: ^bb15
+    %168 = llvm.mlir.constant(0 : index) : i64
+    %169 = llvm.mlir.constant(3 : index) : i64
+    %170 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb17(%168 : i64)
+  ^bb17(%171: i64):  // 2 preds: ^bb16, ^bb21
+    %172 = llvm.icmp "slt" %171, %169 : i64
+    llvm.cond_br %172, ^bb18, ^bb22
+  ^bb18:  // pred: ^bb17
+    %173 = llvm.mlir.constant(0 : index) : i64
+    %174 = llvm.mlir.constant(3 : index) : i64
+    %175 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb19(%173 : i64)
+  ^bb19(%176: i64):  // 2 preds: ^bb18, ^bb20
+    %177 = llvm.icmp "slt" %176, %174 : i64
+    llvm.cond_br %177, ^bb20, ^bb21
+  ^bb20:  // pred: ^bb19
+    %178 = llvm.mul %171, %109 : i64
+    %179 = llvm.mul %178, %110 : i64
+    %180 = llvm.mul %176, %110 : i64
+    %181 = llvm.add %179, %180 : i64
+    %182 = llvm.add %181, %166 : i64
+    %183 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %184 = llvm.mlir.constant(27 : index) : i64
+    %185 = llvm.mul %161, %184 : i64
+    %186 = llvm.mlir.constant(9 : index) : i64
+    %187 = llvm.mul %166, %186 : i64
+    %188 = llvm.add %185, %187 : i64
+    %189 = llvm.mlir.constant(3 : index) : i64
+    %190 = llvm.mul %171, %189 : i64
+    %191 = llvm.add %188, %190 : i64
+    %192 = llvm.add %191, %176 : i64
+    %193 = llvm.getelementptr %183[%192] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %194 = llvm.load %193 : !llvm.ptr -> f32
+    %195 = llvm.extractvalue %77[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %196 = llvm.mlir.constant(64 : index) : i64
+    %197 = llvm.mul %182, %196 : i64
+    %198 = llvm.add %197, %161 : i64
+    %199 = llvm.getelementptr %195[%198] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %194, %199 : f32, !llvm.ptr
+    %200 = llvm.add %176, %175 : i64
+    llvm.br ^bb19(%200 : i64)
+  ^bb21:  // pred: ^bb19
+    %201 = llvm.add %171, %170 : i64
+    llvm.br ^bb17(%201 : i64)
+  ^bb22:  // pred: ^bb17
+    %202 = llvm.add %166, %165 : i64
+    llvm.br ^bb15(%202 : i64)
+  ^bb23:  // pred: ^bb15
+    %203 = llvm.add %161, %160 : i64
+    llvm.br ^bb13(%203 : i64)
+  ^bb24:  // pred: ^bb13
+    %204 = llvm.mlir.constant(3 : i64) : i64
+    %205 = llvm.extractvalue %60[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %206 = llvm.ptrtoint %205 : !llvm.ptr to i64
+    %207 = llvm.extractvalue %107[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %208 = llvm.ptrtoint %207 : !llvm.ptr to i64
+    %209 = llvm.extractvalue %90[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %210 = llvm.ptrtoint %209 : !llvm.ptr to i64
+    %211 = llvm.extractvalue %77[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %212 = llvm.ptrtoint %211 : !llvm.ptr to i64
+    %213 = llvm.mlir.constant(64 : i64) : i64
+    %214 = llvm.mlir.constant(2 : i64) : i64
+    %215 = llvm.mlir.constant(4575657221408424000 : i64) : i64
+    "gemmini.intr.config_st"(%214, %215) : (i64, i64) -> ()
+    %216 = llvm.mlir.constant(65540 : i64) : i64
+    %217 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%216, %217) : (i64, i64) -> ()
+    %218 = llvm.mlir.constant(0 : i64) : i64
+    %219 = llvm.mlir.constant(0 : i64) : i64
+    %220 = llvm.mlir.constant(0 : i64) : i64
+    %221 = llvm.mlir.constant(0 : i64) : i64
+    %222 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %223 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%222, %223) : (i64, i64) -> ()
+    %224 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %225 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%224, %225) : (i64, i64) -> ()
+    %226 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %227 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%226, %227) : (i64, i64) -> ()
+    %228 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %229 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%228, %229) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%212, %208) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%210, %206) : (i64, i64) -> ()
+    %230 = llvm.mlir.constant(768 : i64) : i64
+    %231 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%230, %231) : (i64, i64) -> ()
+    %232 = llvm.mlir.constant(16 : i64) : i64
+    %233 = llvm.add %208, %232 : i64
+    %234 = llvm.mlir.constant(64 : i64) : i64
+    %235 = llvm.add %210, %234 : i64
+    %236 = llvm.mlir.constant(16 : i64) : i64
+    %237 = llvm.add %212, %236 : i64
+    %238 = llvm.mlir.constant(0 : i64) : i64
+    %239 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %240 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%239, %240) : (i64, i64) -> ()
+    %241 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %242 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%241, %242) : (i64, i64) -> ()
+    %243 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %244 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%243, %244) : (i64, i64) -> ()
+    %245 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %246 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%245, %246) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%237, %233) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%235, %206) : (i64, i64) -> ()
+    %247 = llvm.mlir.constant(768 : i64) : i64
+    %248 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%247, %248) : (i64, i64) -> ()
+    %249 = llvm.mlir.constant(32 : i64) : i64
+    %250 = llvm.add %208, %249 : i64
+    %251 = llvm.mlir.constant(128 : i64) : i64
+    %252 = llvm.add %210, %251 : i64
+    %253 = llvm.mlir.constant(32 : i64) : i64
+    %254 = llvm.add %212, %253 : i64
+    %255 = llvm.mlir.constant(0 : i64) : i64
+    %256 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %257 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%256, %257) : (i64, i64) -> ()
+    %258 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %259 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%258, %259) : (i64, i64) -> ()
+    %260 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %261 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%260, %261) : (i64, i64) -> ()
+    %262 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %263 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%262, %263) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%254, %250) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%252, %206) : (i64, i64) -> ()
+    %264 = llvm.mlir.constant(768 : i64) : i64
+    %265 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%264, %265) : (i64, i64) -> ()
+    %266 = llvm.mlir.constant(48 : i64) : i64
+    %267 = llvm.add %208, %266 : i64
+    %268 = llvm.mlir.constant(192 : i64) : i64
+    %269 = llvm.add %210, %268 : i64
+    %270 = llvm.mlir.constant(48 : i64) : i64
+    %271 = llvm.add %212, %270 : i64
+    %272 = llvm.mlir.constant(0 : i64) : i64
+    %273 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %274 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%273, %274) : (i64, i64) -> ()
+    %275 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %276 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%275, %276) : (i64, i64) -> ()
+    %277 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %278 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%277, %278) : (i64, i64) -> ()
+    %279 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %280 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%279, %280) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%271, %267) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%269, %206) : (i64, i64) -> ()
+    %281 = llvm.mlir.constant(768 : i64) : i64
+    %282 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%281, %282) : (i64, i64) -> ()
+    %283 = llvm.mlir.constant(1472 : i64) : i64
+    %284 = llvm.add %208, %283 : i64
+    %285 = llvm.mlir.constant(0 : i64) : i64
+    %286 = llvm.mlir.constant(0 : i64) : i64
+    %287 = llvm.mlir.constant(69 : i64) : i64
+    %288 = llvm.add %206, %287 : i64
+    %289 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %290 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%289, %290) : (i64, i64) -> ()
+    %291 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %292 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%291, %292) : (i64, i64) -> ()
+    %293 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %294 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%293, %294) : (i64, i64) -> ()
+    %295 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %296 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%295, %296) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%212, %284) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%210, %288) : (i64, i64) -> ()
+    %297 = llvm.mlir.constant(768 : i64) : i64
+    %298 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%297, %298) : (i64, i64) -> ()
+    %299 = llvm.mlir.constant(1488 : i64) : i64
+    %300 = llvm.add %208, %299 : i64
+    %301 = llvm.mlir.constant(64 : i64) : i64
+    %302 = llvm.add %210, %301 : i64
+    %303 = llvm.mlir.constant(16 : i64) : i64
+    %304 = llvm.add %212, %303 : i64
+    %305 = llvm.mlir.constant(69 : i64) : i64
+    %306 = llvm.add %206, %305 : i64
+    %307 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %308 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%307, %308) : (i64, i64) -> ()
+    %309 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %310 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%309, %310) : (i64, i64) -> ()
+    %311 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %312 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%311, %312) : (i64, i64) -> ()
+    %313 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %314 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%313, %314) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%304, %300) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%302, %306) : (i64, i64) -> ()
+    %315 = llvm.mlir.constant(768 : i64) : i64
+    %316 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%315, %316) : (i64, i64) -> ()
+    %317 = llvm.mlir.constant(1504 : i64) : i64
+    %318 = llvm.add %208, %317 : i64
+    %319 = llvm.mlir.constant(128 : i64) : i64
+    %320 = llvm.add %210, %319 : i64
+    %321 = llvm.mlir.constant(32 : i64) : i64
+    %322 = llvm.add %212, %321 : i64
+    %323 = llvm.mlir.constant(69 : i64) : i64
+    %324 = llvm.add %206, %323 : i64
+    %325 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %326 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%325, %326) : (i64, i64) -> ()
+    %327 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %328 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%327, %328) : (i64, i64) -> ()
+    %329 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %330 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%329, %330) : (i64, i64) -> ()
+    %331 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %332 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%331, %332) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%322, %318) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%320, %324) : (i64, i64) -> ()
+    %333 = llvm.mlir.constant(768 : i64) : i64
+    %334 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%333, %334) : (i64, i64) -> ()
+    %335 = llvm.mlir.constant(1520 : i64) : i64
+    %336 = llvm.add %208, %335 : i64
+    %337 = llvm.mlir.constant(192 : i64) : i64
+    %338 = llvm.add %210, %337 : i64
+    %339 = llvm.mlir.constant(48 : i64) : i64
+    %340 = llvm.add %212, %339 : i64
+    %341 = llvm.mlir.constant(69 : i64) : i64
+    %342 = llvm.add %206, %341 : i64
+    %343 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %344 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%343, %344) : (i64, i64) -> ()
+    %345 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %346 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%345, %346) : (i64, i64) -> ()
+    %347 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %348 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%347, %348) : (i64, i64) -> ()
+    %349 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %350 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%349, %350) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%340, %336) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%338, %342) : (i64, i64) -> ()
+    %351 = llvm.mlir.constant(768 : i64) : i64
+    %352 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%351, %352) : (i64, i64) -> ()
+    %353 = llvm.mlir.constant(42240 : i64) : i64
+    %354 = llvm.add %208, %353 : i64
+    %355 = llvm.mlir.constant(0 : i64) : i64
+    %356 = llvm.mlir.constant(0 : i64) : i64
+    %357 = llvm.mlir.constant(2112 : i64) : i64
+    %358 = llvm.add %206, %357 : i64
+    %359 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %360 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%359, %360) : (i64, i64) -> ()
+    %361 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %362 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%361, %362) : (i64, i64) -> ()
+    %363 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %364 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%363, %364) : (i64, i64) -> ()
+    %365 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %366 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%365, %366) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%212, %354) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%210, %358) : (i64, i64) -> ()
+    %367 = llvm.mlir.constant(768 : i64) : i64
+    %368 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%367, %368) : (i64, i64) -> ()
+    %369 = llvm.mlir.constant(42256 : i64) : i64
+    %370 = llvm.add %208, %369 : i64
+    %371 = llvm.mlir.constant(64 : i64) : i64
+    %372 = llvm.add %210, %371 : i64
+    %373 = llvm.mlir.constant(16 : i64) : i64
+    %374 = llvm.add %212, %373 : i64
+    %375 = llvm.mlir.constant(2112 : i64) : i64
+    %376 = llvm.add %206, %375 : i64
+    %377 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %378 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%377, %378) : (i64, i64) -> ()
+    %379 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %380 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%379, %380) : (i64, i64) -> ()
+    %381 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %382 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%381, %382) : (i64, i64) -> ()
+    %383 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %384 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%383, %384) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%374, %370) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%372, %376) : (i64, i64) -> ()
+    %385 = llvm.mlir.constant(768 : i64) : i64
+    %386 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%385, %386) : (i64, i64) -> ()
+    %387 = llvm.mlir.constant(42272 : i64) : i64
+    %388 = llvm.add %208, %387 : i64
+    %389 = llvm.mlir.constant(128 : i64) : i64
+    %390 = llvm.add %210, %389 : i64
+    %391 = llvm.mlir.constant(32 : i64) : i64
+    %392 = llvm.add %212, %391 : i64
+    %393 = llvm.mlir.constant(2112 : i64) : i64
+    %394 = llvm.add %206, %393 : i64
+    %395 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %396 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%395, %396) : (i64, i64) -> ()
+    %397 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %398 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%397, %398) : (i64, i64) -> ()
+    %399 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %400 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%399, %400) : (i64, i64) -> ()
+    %401 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %402 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%401, %402) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%392, %388) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%390, %394) : (i64, i64) -> ()
+    %403 = llvm.mlir.constant(768 : i64) : i64
+    %404 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%403, %404) : (i64, i64) -> ()
+    %405 = llvm.mlir.constant(42288 : i64) : i64
+    %406 = llvm.add %208, %405 : i64
+    %407 = llvm.mlir.constant(192 : i64) : i64
+    %408 = llvm.add %210, %407 : i64
+    %409 = llvm.mlir.constant(48 : i64) : i64
+    %410 = llvm.add %212, %409 : i64
+    %411 = llvm.mlir.constant(2112 : i64) : i64
+    %412 = llvm.add %206, %411 : i64
+    %413 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %414 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%413, %414) : (i64, i64) -> ()
+    %415 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %416 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%415, %416) : (i64, i64) -> ()
+    %417 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %418 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%417, %418) : (i64, i64) -> ()
+    %419 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %420 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%419, %420) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%410, %406) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%408, %412) : (i64, i64) -> ()
+    %421 = llvm.mlir.constant(768 : i64) : i64
+    %422 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%421, %422) : (i64, i64) -> ()
+    %423 = llvm.mlir.constant(43712 : i64) : i64
+    %424 = llvm.add %208, %423 : i64
+    %425 = llvm.mlir.constant(0 : i64) : i64
+    %426 = llvm.mlir.constant(0 : i64) : i64
+    %427 = llvm.mlir.constant(2181 : i64) : i64
+    %428 = llvm.add %206, %427 : i64
+    %429 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %430 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%429, %430) : (i64, i64) -> ()
+    %431 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %432 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%431, %432) : (i64, i64) -> ()
+    %433 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %434 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%433, %434) : (i64, i64) -> ()
+    %435 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %436 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%435, %436) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%212, %424) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%210, %428) : (i64, i64) -> ()
+    %437 = llvm.mlir.constant(768 : i64) : i64
+    %438 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%437, %438) : (i64, i64) -> ()
+    %439 = llvm.mlir.constant(43728 : i64) : i64
+    %440 = llvm.add %208, %439 : i64
+    %441 = llvm.mlir.constant(64 : i64) : i64
+    %442 = llvm.add %210, %441 : i64
+    %443 = llvm.mlir.constant(16 : i64) : i64
+    %444 = llvm.add %212, %443 : i64
+    %445 = llvm.mlir.constant(2181 : i64) : i64
+    %446 = llvm.add %206, %445 : i64
+    %447 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %448 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%447, %448) : (i64, i64) -> ()
+    %449 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %450 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%449, %450) : (i64, i64) -> ()
+    %451 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %452 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%451, %452) : (i64, i64) -> ()
+    %453 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %454 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%453, %454) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%444, %440) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%442, %446) : (i64, i64) -> ()
+    %455 = llvm.mlir.constant(768 : i64) : i64
+    %456 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%455, %456) : (i64, i64) -> ()
+    %457 = llvm.mlir.constant(43744 : i64) : i64
+    %458 = llvm.add %208, %457 : i64
+    %459 = llvm.mlir.constant(128 : i64) : i64
+    %460 = llvm.add %210, %459 : i64
+    %461 = llvm.mlir.constant(32 : i64) : i64
+    %462 = llvm.add %212, %461 : i64
+    %463 = llvm.mlir.constant(2181 : i64) : i64
+    %464 = llvm.add %206, %463 : i64
+    %465 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %466 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%465, %466) : (i64, i64) -> ()
+    %467 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %468 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%467, %468) : (i64, i64) -> ()
+    %469 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %470 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%469, %470) : (i64, i64) -> ()
+    %471 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %472 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%471, %472) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%462, %458) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%460, %464) : (i64, i64) -> ()
+    %473 = llvm.mlir.constant(768 : i64) : i64
+    %474 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%473, %474) : (i64, i64) -> ()
+    %475 = llvm.mlir.constant(43760 : i64) : i64
+    %476 = llvm.add %208, %475 : i64
+    %477 = llvm.mlir.constant(192 : i64) : i64
+    %478 = llvm.add %210, %477 : i64
+    %479 = llvm.mlir.constant(48 : i64) : i64
+    %480 = llvm.add %212, %479 : i64
+    %481 = llvm.mlir.constant(2181 : i64) : i64
+    %482 = llvm.add %206, %481 : i64
+    %483 = llvm.mlir.constant(18014411396481025 : i64) : i64
+    %484 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%483, %484) : (i64, i64) -> ()
+    %485 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %486 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%485, %486) : (i64, i64) -> ()
+    %487 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %488 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%487, %488) : (i64, i64) -> ()
+    %489 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %490 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%489, %490) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%480, %476) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%478, %482) : (i64, i64) -> ()
+    %491 = llvm.mlir.constant(768 : i64) : i64
+    %492 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%491, %492) : (i64, i64) -> ()
+    %493 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%493, %493) : (i64, i64) -> ()
+    %494 = llvm.mlir.constant(0 : index) : i64
+    %495 = llvm.mlir.constant(1 : index) : i64
+    %496 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb25(%494 : i64)
+  ^bb25(%497: i64):  // 2 preds: ^bb24, ^bb35
+    %498 = llvm.icmp "slt" %497, %495 : i64
+    llvm.cond_br %498, ^bb26, ^bb36
+  ^bb26:  // pred: ^bb25
+    %499 = llvm.mlir.constant(0 : index) : i64
+    %500 = llvm.mlir.constant(64 : index) : i64
+    %501 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb27(%499 : i64)
+  ^bb27(%502: i64):  // 2 preds: ^bb26, ^bb34
+    %503 = llvm.icmp "slt" %502, %500 : i64
+    llvm.cond_br %503, ^bb28, ^bb35
+  ^bb28:  // pred: ^bb27
+    %504 = llvm.mlir.constant(0 : index) : i64
+    %505 = llvm.mlir.constant(30 : index) : i64
+    %506 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb29(%504 : i64)
+  ^bb29(%507: i64):  // 2 preds: ^bb28, ^bb33
+    %508 = llvm.icmp "slt" %507, %505 : i64
+    llvm.cond_br %508, ^bb30, ^bb34
+  ^bb30:  // pred: ^bb29
+    %509 = llvm.mlir.constant(0 : index) : i64
+    %510 = llvm.mlir.constant(30 : index) : i64
+    %511 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb31(%509 : i64)
+  ^bb31(%512: i64):  // 2 preds: ^bb30, ^bb32
+    %513 = llvm.icmp "slt" %512, %510 : i64
+    llvm.cond_br %513, ^bb32, ^bb33
+  ^bb32:  // pred: ^bb31
+    %514 = llvm.mlir.constant(30 : index) : i64
+    %515 = llvm.mul %497, %514 : i64
+    %516 = llvm.mul %515, %514 : i64
+    %517 = llvm.mul %507, %514 : i64
+    %518 = llvm.add %516, %517 : i64
+    %519 = llvm.add %518, %512 : i64
+    %520 = llvm.extractvalue %107[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %521 = llvm.mlir.constant(64 : index) : i64
+    %522 = llvm.mul %519, %521 : i64
+    %523 = llvm.add %522, %502 : i64
+    %524 = llvm.getelementptr %520[%523] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %525 = llvm.load %524 : !llvm.ptr -> f32
+    %526 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %527 = llvm.mlir.constant(57600 : index) : i64
+    %528 = llvm.mul %497, %527 : i64
+    %529 = llvm.mlir.constant(900 : index) : i64
+    %530 = llvm.mul %502, %529 : i64
+    %531 = llvm.add %528, %530 : i64
+    %532 = llvm.mlir.constant(30 : index) : i64
+    %533 = llvm.mul %507, %532 : i64
+    %534 = llvm.add %531, %533 : i64
+    %535 = llvm.add %534, %512 : i64
+    %536 = llvm.getelementptr %526[%535] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %525, %536 : f32, !llvm.ptr
+    %537 = llvm.add %512, %511 : i64
+    llvm.br ^bb31(%537 : i64)
+  ^bb33:  // pred: ^bb31
+    %538 = llvm.add %507, %506 : i64
+    llvm.br ^bb29(%538 : i64)
+  ^bb34:  // pred: ^bb29
+    %539 = llvm.add %502, %501 : i64
+    llvm.br ^bb27(%539 : i64)
+  ^bb35:  // pred: ^bb27
+    %540 = llvm.add %497, %496 : i64
+    llvm.br ^bb25(%540 : i64)
+  ^bb36:  // pred: ^bb25
+    %541 = llvm.extractvalue %60[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    llvm.call @free(%541) : (!llvm.ptr) -> ()
+    %542 = llvm.extractvalue %77[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%542) : (!llvm.ptr) -> ()
+    %543 = llvm.extractvalue %107[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%543) : (!llvm.ptr) -> ()
+    %544 = llvm.extractvalue %90[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    llvm.call @free(%544) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+}
+
+
diff --git a/experiments/gemmini/networks/conv2d_block_nchw.mlir b/experiments/gemmini/networks/conv2d_block_nchw.mlir
new file mode 100644
index 0000000..2df3f75
--- /dev/null
+++ b/experiments/gemmini/networks/conv2d_block_nchw.mlir
@@ -0,0 +1,17 @@
+module {
+  // Single-convolution test input. NCHW input: [N, C, H, W] = [1, 3, 32, 32]
+  // FCHW filter: [F, C, KH, KW] = [64, 3, 3, 3]
+  // NCHW output: [N, F, OH, OW] = [1, 64, 30, 30]; OH = OW = 32 - 3 + 1 = 30 (no padding, stride 1)
+  func.func @conv2d_block_nchw(
+      %input:  memref<1x3x32x32xf32>,
+      %filter: memref<64x3x3x3xf32>,
+      %output: memref<1x64x30x30xf32>
+  ) {
+    linalg.conv_2d_nchw_fchw  // no strides/dilations attributes are given on this op
+      ins(%input, %filter :
+          memref<1x3x32x32xf32>, memref<64x3x3x3xf32>)
+      outs(%output :  // NOTE(review): linalg convolutions accumulate into outs - confirm callers zero %output first
+          memref<1x64x30x30xf32>)
+    return
+  }
+}

From 4c3e9714526f64065ac065cec75ad9f966fd838c Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Wed, 7 Jan 2026 17:49:18 -0800
Subject: [PATCH 08/13] Buddy Gemmini: add mini CNN conv block lowering test

---
 .../logs/mini_cnn_block.print-after-all.mlir  | 1195 +++++++++++++++++
 .../gemmini/networks/mini_cnn_block.mlir      |   34 +
 2 files changed, 1229 insertions(+)
 create mode 100644 experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir
 create mode 100644 experiments/gemmini/networks/mini_cnn_block.mlir

diff --git a/experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir b/experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir
new file mode 100644
index 0000000..0956689
--- /dev/null
+++ b/experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir
@@ -0,0 +1,1195 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {
+  func.func @mini_cnn_block(%arg0: memref<1x3x32x32xf32>, %arg1: memref<16x3x3x3xf32>, %arg2: memref<32x16x3x3xf32>, %arg3: memref<1x32x26x26xf32>) {  // two gemmini.tile_conv stages; the final result is copied into %arg3
+    %alloc = memref.alloc() : memref<1x16x30x30xf32>  // intermediate buffer: filled after the first tile_conv, read by the second stage's re-layout
+    %alloc_0 = memref.alloc() : memref<1x32x26x26xf32>  // staging buffer for the final result, copied into %arg3 at the end
+    %alloc_1 = memref.alloc() : memref<1x32x32x3xf32>  // copy of %arg0 with dim 1 moved to the innermost position
+    %alloc_2 = memref.alloc() : memref<27x16xf32>  // %arg1 flattened to 27 rows x 16 columns
+    %alloc_3 = memref.alloc() : memref<16xi32>  // NOTE(review): no store to this buffer is visible before it is passed to gemmini.tile_conv - confirm the op does not read uninitialized data
+    %alloc_4 = memref.alloc() : memref<900x16xf32>  // 4th tile_conv operand; read back below, so presumably the conv output - confirm
+    %c30_i64 = arith.constant 30 : i64
+    %c3 = arith.constant 3 : index
+    %c3_5 = arith.constant 3 : index
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c1_6 = arith.constant 1 : index
+    scf.for %arg4 = %c0 to %c1 step %c1_6 {  // re-layout, with (i0, i1, i2, i3) = (%arg4..%arg7): %alloc_1[i0, i2, i3, i1] = %arg0[i0, i1, i2, i3]
+      %c0_27 = arith.constant 0 : index
+      %c3_28 = arith.constant 3 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c32_31 = arith.constant 32 : index
+        %c1_32 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c32_31 step %c1_32 {
+          %c0_33 = arith.constant 0 : index
+          %c32_34 = arith.constant 32 : index
+          %c1_35 = arith.constant 1 : index
+          scf.for %arg7 = %c0_33 to %c32_34 step %c1_35 {
+            %0 = memref.load %arg0[%arg4, %arg5, %arg6, %arg7] : memref<1x3x32x32xf32>
+            memref.store %0, %alloc_1[%arg4, %arg6, %arg7, %arg5] : memref<1x32x32x3xf32>
+          }
+        }
+      }
+    }
+    %c0_7 = arith.constant 0 : index
+    %c16 = arith.constant 16 : index
+    %c1_8 = arith.constant 1 : index
+    scf.for %arg4 = %c0_7 to %c16 step %c1_8 {  // flatten: %alloc_2[(i2*3 + i3)*3 + i1, i0] = %arg1[i0, i1, i2, i3]
+      %c0_27 = arith.constant 0 : index
+      %c3_28 = arith.constant 3 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c3_31 = arith.constant 3 : index
+        %c1_32 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 {
+          %c0_33 = arith.constant 0 : index
+          %c3_34 = arith.constant 3 : index
+          %c1_35 = arith.constant 1 : index
+          scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 {
+            %0 = arith.muli %arg6, %c3 : index
+            %1 = arith.muli %0, %c3_5 : index
+            %2 = arith.muli %arg7, %c3_5 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index  // row index = i2*3*3 + i3*3 + i1
+            %5 = memref.load %arg1[%arg4, %arg5, %arg6, %arg7] : memref<16x3x3x3xf32>
+            memref.store %5, %alloc_2[%4, %arg4] : memref<27x16xf32>
+          }
+        }
+      }
+    }
+    %c3_i64 = arith.constant 3 : i64
+    gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64  // NOTE(review): operands look like (input, weights, bias, output, out rows, out cols, kernel size) - confirm against the Gemmini dialect definition
+    %c0_9 = arith.constant 0 : index
+    %c1_10 = arith.constant 1 : index
+    %c1_11 = arith.constant 1 : index
+    scf.for %arg4 = %c0_9 to %c1_10 step %c1_11 {  // unflatten: %alloc[i0, i1, i2, i3] = %alloc_4[i0*30*30 + i2*30 + i3, i1]
+      %c0_27 = arith.constant 0 : index
+      %c16_28 = arith.constant 16 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c30 = arith.constant 30 : index
+        %c1_31 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c30 step %c1_31 {
+          %c0_32 = arith.constant 0 : index
+          %c30_33 = arith.constant 30 : index
+          %c1_34 = arith.constant 1 : index
+          scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 {
+            %c30_35 = arith.constant 30 : index
+            %0 = arith.muli %arg4, %c30_35 : index
+            %1 = arith.muli %0, %c30_35 : index
+            %2 = arith.muli %arg6, %c30_35 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg7 : index  // row index = i0*30*30 + i2*30 + i3
+            %5 = memref.load %alloc_4[%4, %arg5] : memref<900x16xf32>
+            memref.store %5, %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc_1 : memref<1x32x32x3xf32>  // first-stage temporaries are released here; %alloc stays live for the second stage
+    memref.dealloc %alloc_2 : memref<27x16xf32>
+    memref.dealloc %alloc_4 : memref<900x16xf32>
+    memref.dealloc %alloc_3 : memref<16xi32>
+    %alloc_12 = memref.alloc() : memref<1x30x30x16xf32>  // copy of %alloc with dim 1 moved to the innermost position
+    %alloc_13 = memref.alloc() : memref<144x32xf32>  // %arg2 flattened to 144 rows x 32 columns
+    %alloc_14 = memref.alloc() : memref<32xi32>  // NOTE(review): like %alloc_3, no store to this buffer is visible before the tile_conv below - confirm
+    %alloc_15 = memref.alloc() : memref<676x32xf32>  // 4th operand of the second tile_conv; read back below
+    %c26_i64 = arith.constant 26 : i64
+    %c3_16 = arith.constant 3 : index
+    %c16_17 = arith.constant 16 : index
+    %c0_18 = arith.constant 0 : index
+    %c1_19 = arith.constant 1 : index
+    %c1_20 = arith.constant 1 : index
+    scf.for %arg4 = %c0_18 to %c1_19 step %c1_20 {  // re-layout: %alloc_12[i0, i2, i3, i1] = %alloc[i0, i1, i2, i3]
+      %c0_27 = arith.constant 0 : index
+      %c16_28 = arith.constant 16 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c30 = arith.constant 30 : index
+        %c1_31 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c30 step %c1_31 {
+          %c0_32 = arith.constant 0 : index
+          %c30_33 = arith.constant 30 : index
+          %c1_34 = arith.constant 1 : index
+          scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 {
+            %0 = memref.load %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32>
+            memref.store %0, %alloc_12[%arg4, %arg6, %arg7, %arg5] : memref<1x30x30x16xf32>
+          }
+        }
+      }
+    }
+    %c0_21 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1_22 = arith.constant 1 : index
+    scf.for %arg4 = %c0_21 to %c32 step %c1_22 {  // flatten: %alloc_13[(i2*3 + i3)*16 + i1, i0] = %arg2[i0, i1, i2, i3]
+      %c0_27 = arith.constant 0 : index
+      %c16_28 = arith.constant 16 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c3_31 = arith.constant 3 : index
+        %c1_32 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 {
+          %c0_33 = arith.constant 0 : index
+          %c3_34 = arith.constant 3 : index
+          %c1_35 = arith.constant 1 : index
+          scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 {
+            %0 = arith.muli %arg6, %c3_16 : index
+            %1 = arith.muli %0, %c16_17 : index
+            %2 = arith.muli %arg7, %c16_17 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index  // row index = i2*3*16 + i3*16 + i1
+            %5 = memref.load %arg2[%arg4, %arg5, %arg6, %arg7] : memref<32x16x3x3xf32>
+            memref.store %5, %alloc_13[%4, %arg4] : memref<144x32xf32>
+          }
+        }
+      }
+    }
+    %c3_i64_23 = arith.constant 3 : i64
+    gemmini.tile_conv %alloc_12 %alloc_13 %alloc_14 %alloc_15 %c26_i64 %c26_i64 %c3_i64_23 : memref<1x30x30x16xf32> memref<144x32xf32> memref<32xi32> memref<676x32xf32> i64 i64 i64  // second stage; same operand pattern as the first tile_conv
+    %c0_24 = arith.constant 0 : index
+    %c1_25 = arith.constant 1 : index
+    %c1_26 = arith.constant 1 : index
+    scf.for %arg4 = %c0_24 to %c1_25 step %c1_26 {  // unflatten: %alloc_0[i0, i1, i2, i3] = %alloc_15[i0*26*26 + i2*26 + i3, i1]
+      %c0_27 = arith.constant 0 : index
+      %c32_28 = arith.constant 32 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c32_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c26 = arith.constant 26 : index
+        %c1_31 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c26 step %c1_31 {
+          %c0_32 = arith.constant 0 : index
+          %c26_33 = arith.constant 26 : index
+          %c1_34 = arith.constant 1 : index
+          scf.for %arg7 = %c0_32 to %c26_33 step %c1_34 {
+            %c26_35 = arith.constant 26 : index
+            %0 = arith.muli %arg4, %c26_35 : index
+            %1 = arith.muli %0, %c26_35 : index
+            %2 = arith.muli %arg6, %c26_35 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg7 : index  // row index = i0*26*26 + i2*26 + i3
+            %5 = memref.load %alloc_15[%4, %arg5] : memref<676x32xf32>
+            memref.store %5, %alloc_0[%arg4, %arg5, %arg6, %arg7] : memref<1x32x26x26xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc_12 : memref<1x30x30x16xf32>  // second-stage temporaries are released here
+    memref.dealloc %alloc_13 : memref<144x32xf32>
+    memref.dealloc %alloc_15 : memref<676x32xf32>
+    memref.dealloc %alloc_14 : memref<32xi32>
+    linalg.copy ins(%alloc_0 : memref<1x32x26x26xf32>) outs(%arg3 : memref<1x32x26x26xf32>)  // write the staged result into the function's output argument
+    memref.dealloc %alloc : memref<1x16x30x30xf32>
+    memref.dealloc %alloc_0 : memref<1x32x26x26xf32>
+    return
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @mini_cnn_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %1 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %2 = llvm.insertvalue %arg34, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %3 = llvm.insertvalue %arg35, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %4 = llvm.insertvalue %arg36, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %5 = llvm.insertvalue %arg40, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %6 = llvm.insertvalue %arg37, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %7 = llvm.insertvalue %arg41, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %8 = llvm.insertvalue %arg38, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %9 = llvm.insertvalue %arg42, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %10 = llvm.insertvalue %arg39, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %11 = llvm.insertvalue %arg43, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %12 = builtin.unrealized_conversion_cast %11 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> to memref<1x32x26x26xf32>
+    %13 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %14 = llvm.insertvalue %arg22, %13[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %15 = llvm.insertvalue %arg23, %14[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %16 = llvm.insertvalue %arg24, %15[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %17 = llvm.insertvalue %arg25, %16[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %18 = llvm.insertvalue %arg29, %17[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %19 = llvm.insertvalue %arg26, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %20 = llvm.insertvalue %arg30, %19[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %21 = llvm.insertvalue %arg27, %20[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %22 = llvm.insertvalue %arg31, %21[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %23 = llvm.insertvalue %arg28, %22[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %24 = llvm.insertvalue %arg32, %23[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %25 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %26 = llvm.insertvalue %arg11, %25[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %27 = llvm.insertvalue %arg12, %26[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %28 = llvm.insertvalue %arg13, %27[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %29 = llvm.insertvalue %arg14, %28[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %30 = llvm.insertvalue %arg18, %29[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %31 = llvm.insertvalue %arg15, %30[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %32 = llvm.insertvalue %arg19, %31[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %33 = llvm.insertvalue %arg16, %32[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %34 = llvm.insertvalue %arg20, %33[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %35 = llvm.insertvalue %arg17, %34[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %36 = llvm.insertvalue %arg21, %35[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %37 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %38 = llvm.insertvalue %arg0, %37[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %39 = llvm.insertvalue %arg1, %38[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %40 = llvm.insertvalue %arg2, %39[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %41 = llvm.insertvalue %arg3, %40[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %42 = llvm.insertvalue %arg7, %41[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %43 = llvm.insertvalue %arg4, %42[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %44 = llvm.insertvalue %arg8, %43[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %45 = llvm.insertvalue %arg5, %44[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %46 = llvm.insertvalue %arg9, %45[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %47 = llvm.insertvalue %arg6, %46[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %48 = llvm.insertvalue %arg10, %47[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %49 = llvm.mlir.constant(1 : index) : i64
+    %50 = llvm.mlir.constant(16 : index) : i64
+    %51 = llvm.mlir.constant(30 : index) : i64
+    %52 = llvm.mlir.constant(30 : index) : i64
+    %53 = llvm.mlir.constant(1 : index) : i64
+    %54 = llvm.mlir.constant(900 : index) : i64
+    %55 = llvm.mlir.constant(14400 : index) : i64
+    %56 = llvm.mlir.constant(14400 : index) : i64
+    %57 = llvm.mlir.zero : !llvm.ptr
+    %58 = llvm.getelementptr %57[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %59 = llvm.ptrtoint %58 : !llvm.ptr to i64
+    %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr
+    %61 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %64 = llvm.mlir.constant(0 : index) : i64
+    %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %66 = llvm.insertvalue %49, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %67 = llvm.insertvalue %50, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %68 = llvm.insertvalue %51, %67[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %69 = llvm.insertvalue %52, %68[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %70 = llvm.insertvalue %55, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %71 = llvm.insertvalue %54, %70[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %72 = llvm.insertvalue %52, %71[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %73 = llvm.insertvalue %53, %72[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %74 = llvm.mlir.constant(1 : index) : i64
+    %75 = llvm.mlir.constant(32 : index) : i64
+    %76 = llvm.mlir.constant(26 : index) : i64
+    %77 = llvm.mlir.constant(26 : index) : i64
+    %78 = llvm.mlir.constant(1 : index) : i64
+    %79 = llvm.mlir.constant(676 : index) : i64
+    %80 = llvm.mlir.constant(21632 : index) : i64
+    %81 = llvm.mlir.constant(21632 : index) : i64
+    %82 = llvm.mlir.zero : !llvm.ptr
+    %83 = llvm.getelementptr %82[%81] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %84 = llvm.ptrtoint %83 : !llvm.ptr to i64
+    %85 = llvm.call @malloc(%84) : (i64) -> !llvm.ptr
+    %86 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %87 = llvm.insertvalue %85, %86[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %88 = llvm.insertvalue %85, %87[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %89 = llvm.mlir.constant(0 : index) : i64
+    %90 = llvm.insertvalue %89, %88[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %91 = llvm.insertvalue %74, %90[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %92 = llvm.insertvalue %75, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %93 = llvm.insertvalue %76, %92[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %94 = llvm.insertvalue %77, %93[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %95 = llvm.insertvalue %80, %94[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %96 = llvm.insertvalue %79, %95[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %97 = llvm.insertvalue %77, %96[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %98 = llvm.insertvalue %78, %97[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %99 = builtin.unrealized_conversion_cast %98 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> to memref<1x32x26x26xf32>
+    %100 = llvm.mlir.constant(1 : index) : i64
+    %101 = llvm.mlir.constant(32 : index) : i64
+    %102 = llvm.mlir.constant(32 : index) : i64
+    %103 = llvm.mlir.constant(3 : index) : i64
+    %104 = llvm.mlir.constant(1 : index) : i64
+    %105 = llvm.mlir.constant(96 : index) : i64
+    %106 = llvm.mlir.constant(3072 : index) : i64
+    %107 = llvm.mlir.constant(3072 : index) : i64
+    %108 = llvm.mlir.zero : !llvm.ptr
+    %109 = llvm.getelementptr %108[%107] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %110 = llvm.ptrtoint %109 : !llvm.ptr to i64
+    %111 = llvm.call @malloc(%110) : (i64) -> !llvm.ptr
+    %112 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %113 = llvm.insertvalue %111, %112[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %114 = llvm.insertvalue %111, %113[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %115 = llvm.mlir.constant(0 : index) : i64
+    %116 = llvm.insertvalue %115, %114[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %117 = llvm.insertvalue %100, %116[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %118 = llvm.insertvalue %101, %117[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %119 = llvm.insertvalue %102, %118[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %120 = llvm.insertvalue %103, %119[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %121 = llvm.insertvalue %106, %120[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %122 = llvm.insertvalue %105, %121[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %123 = llvm.insertvalue %103, %122[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %124 = llvm.insertvalue %104, %123[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %125 = llvm.mlir.constant(27 : index) : i64
+    %126 = llvm.mlir.constant(16 : index) : i64
+    %127 = llvm.mlir.constant(1 : index) : i64
+    %128 = llvm.mlir.constant(432 : index) : i64
+    %129 = llvm.mlir.zero : !llvm.ptr
+    %130 = llvm.getelementptr %129[%128] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %131 = llvm.ptrtoint %130 : !llvm.ptr to i64
+    %132 = llvm.call @malloc(%131) : (i64) -> !llvm.ptr
+    %133 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %134 = llvm.insertvalue %132, %133[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %135 = llvm.insertvalue %132, %134[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %136 = llvm.mlir.constant(0 : index) : i64
+    %137 = llvm.insertvalue %136, %135[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %138 = llvm.insertvalue %125, %137[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %139 = llvm.insertvalue %126, %138[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %140 = llvm.insertvalue %126, %139[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %141 = llvm.insertvalue %127, %140[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %142 = llvm.mlir.constant(16 : index) : i64
+    %143 = llvm.mlir.constant(1 : index) : i64
+    %144 = llvm.mlir.zero : !llvm.ptr
+    %145 = llvm.getelementptr %144[%142] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %146 = llvm.ptrtoint %145 : !llvm.ptr to i64
+    %147 = llvm.call @malloc(%146) : (i64) -> !llvm.ptr
+    %148 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %149 = llvm.insertvalue %147, %148[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %150 = llvm.insertvalue %147, %149[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %151 = llvm.mlir.constant(0 : index) : i64
+    %152 = llvm.insertvalue %151, %150[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %153 = llvm.insertvalue %142, %152[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %154 = llvm.insertvalue %143, %153[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %155 = llvm.mlir.constant(900 : index) : i64
+    %156 = llvm.mlir.constant(16 : index) : i64
+    %157 = llvm.mlir.constant(1 : index) : i64
+    %158 = llvm.mlir.constant(14400 : index) : i64
+    %159 = llvm.mlir.zero : !llvm.ptr
+    %160 = llvm.getelementptr %159[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %161 = llvm.ptrtoint %160 : !llvm.ptr to i64
+    %162 = llvm.call @malloc(%161) : (i64) -> !llvm.ptr
+    %163 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %164 = llvm.insertvalue %162, %163[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %165 = llvm.insertvalue %162, %164[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %166 = llvm.mlir.constant(0 : index) : i64
+    %167 = llvm.insertvalue %166, %165[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %168 = llvm.insertvalue %155, %167[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %169 = llvm.insertvalue %156, %168[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %170 = llvm.insertvalue %156, %169[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %171 = llvm.insertvalue %157, %170[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %172 = llvm.mlir.constant(30 : i64) : i64
+    %173 = llvm.mlir.constant(3 : index) : i64
+    %174 = llvm.mlir.constant(3 : index) : i64
+    %175 = llvm.mlir.constant(0 : index) : i64
+    %176 = llvm.mlir.constant(1 : index) : i64
+    %177 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb1(%175 : i64)
+  ^bb1(%178: i64):  // 2 preds: ^bb0, ^bb11
+    %179 = llvm.icmp "slt" %178, %176 : i64
+    llvm.cond_br %179, ^bb2, ^bb12
+  ^bb2:  // pred: ^bb1
+    %180 = llvm.mlir.constant(0 : index) : i64
+    %181 = llvm.mlir.constant(3 : index) : i64
+    %182 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb3(%180 : i64)
+  ^bb3(%183: i64):  // 2 preds: ^bb2, ^bb10
+    %184 = llvm.icmp "slt" %183, %181 : i64
+    llvm.cond_br %184, ^bb4, ^bb11
+  ^bb4:  // pred: ^bb3
+    %185 = llvm.mlir.constant(0 : index) : i64
+    %186 = llvm.mlir.constant(32 : index) : i64
+    %187 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb5(%185 : i64)
+  ^bb5(%188: i64):  // 2 preds: ^bb4, ^bb9
+    %189 = llvm.icmp "slt" %188, %186 : i64
+    llvm.cond_br %189, ^bb6, ^bb10
+  ^bb6:  // pred: ^bb5
+    %190 = llvm.mlir.constant(0 : index) : i64
+    %191 = llvm.mlir.constant(32 : index) : i64
+    %192 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb7(%190 : i64)
+  ^bb7(%193: i64):  // 2 preds: ^bb6, ^bb8
+    %194 = llvm.icmp "slt" %193, %191 : i64
+    llvm.cond_br %194, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %195 = llvm.extractvalue %48[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %196 = llvm.mlir.constant(3072 : index) : i64
+    %197 = llvm.mul %178, %196 : i64
+    %198 = llvm.mlir.constant(1024 : index) : i64
+    %199 = llvm.mul %183, %198 : i64
+    %200 = llvm.add %197, %199 : i64
+    %201 = llvm.mlir.constant(32 : index) : i64
+    %202 = llvm.mul %188, %201 : i64
+    %203 = llvm.add %200, %202 : i64
+    %204 = llvm.add %203, %193 : i64
+    %205 = llvm.getelementptr %195[%204] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %206 = llvm.load %205 : !llvm.ptr -> f32
+    %207 = llvm.extractvalue %124[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %208 = llvm.mlir.constant(3072 : index) : i64
+    %209 = llvm.mul %178, %208 : i64
+    %210 = llvm.mlir.constant(96 : index) : i64
+    %211 = llvm.mul %188, %210 : i64
+    %212 = llvm.add %209, %211 : i64
+    %213 = llvm.mlir.constant(3 : index) : i64
+    %214 = llvm.mul %193, %213 : i64
+    %215 = llvm.add %212, %214 : i64
+    %216 = llvm.add %215, %183 : i64
+    %217 = llvm.getelementptr %207[%216] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %206, %217 : f32, !llvm.ptr
+    %218 = llvm.add %193, %192 : i64
+    llvm.br ^bb7(%218 : i64)
+  ^bb9:  // pred: ^bb7
+    %219 = llvm.add %188, %187 : i64
+    llvm.br ^bb5(%219 : i64)
+  ^bb10:  // pred: ^bb5
+    %220 = llvm.add %183, %182 : i64
+    llvm.br ^bb3(%220 : i64)
+  ^bb11:  // pred: ^bb3
+    %221 = llvm.add %178, %177 : i64
+    llvm.br ^bb1(%221 : i64)
+  ^bb12:  // pred: ^bb1
+    %222 = llvm.mlir.constant(0 : index) : i64
+    %223 = llvm.mlir.constant(16 : index) : i64
+    %224 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%222 : i64)
+  ^bb13(%225: i64):  // 2 preds: ^bb12, ^bb23
+    %226 = llvm.icmp "slt" %225, %223 : i64
+    llvm.cond_br %226, ^bb14, ^bb24
+  ^bb14:  // pred: ^bb13
+    %227 = llvm.mlir.constant(0 : index) : i64
+    %228 = llvm.mlir.constant(3 : index) : i64
+    %229 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb15(%227 : i64)
+  ^bb15(%230: i64):  // 2 preds: ^bb14, ^bb22
+    %231 = llvm.icmp "slt" %230, %228 : i64
+    llvm.cond_br %231, ^bb16, ^bb23
+  ^bb16:  // pred: ^bb15
+    %232 = llvm.mlir.constant(0 : index) : i64
+    %233 = llvm.mlir.constant(3 : index) : i64
+    %234 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb17(%232 : i64)
+  ^bb17(%235: i64):  // 2 preds: ^bb16, ^bb21
+    %236 = llvm.icmp "slt" %235, %233 : i64
+    llvm.cond_br %236, ^bb18, ^bb22
+  ^bb18:  // pred: ^bb17
+    %237 = llvm.mlir.constant(0 : index) : i64
+    %238 = llvm.mlir.constant(3 : index) : i64
+    %239 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb19(%237 : i64)
+  ^bb19(%240: i64):  // 2 preds: ^bb18, ^bb20
+    %241 = llvm.icmp "slt" %240, %238 : i64
+    llvm.cond_br %241, ^bb20, ^bb21
+  ^bb20:  // pred: ^bb19
+    %242 = llvm.mul %235, %173 : i64
+    %243 = llvm.mul %242, %174 : i64
+    %244 = llvm.mul %240, %174 : i64
+    %245 = llvm.add %243, %244 : i64
+    %246 = llvm.add %245, %230 : i64
+    %247 = llvm.extractvalue %36[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %248 = llvm.mlir.constant(27 : index) : i64
+    %249 = llvm.mul %225, %248 : i64
+    %250 = llvm.mlir.constant(9 : index) : i64
+    %251 = llvm.mul %230, %250 : i64
+    %252 = llvm.add %249, %251 : i64
+    %253 = llvm.mlir.constant(3 : index) : i64
+    %254 = llvm.mul %235, %253 : i64
+    %255 = llvm.add %252, %254 : i64
+    %256 = llvm.add %255, %240 : i64
+    %257 = llvm.getelementptr %247[%256] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %258 = llvm.load %257 : !llvm.ptr -> f32
+    %259 = llvm.extractvalue %141[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %260 = llvm.mlir.constant(16 : index) : i64
+    %261 = llvm.mul %246, %260 : i64
+    %262 = llvm.add %261, %225 : i64
+    %263 = llvm.getelementptr %259[%262] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %258, %263 : f32, !llvm.ptr
+    %264 = llvm.add %240, %239 : i64
+    llvm.br ^bb19(%264 : i64)
+  ^bb21:  // pred: ^bb19
+    %265 = llvm.add %235, %234 : i64
+    llvm.br ^bb17(%265 : i64)
+  ^bb22:  // pred: ^bb17
+    %266 = llvm.add %230, %229 : i64
+    llvm.br ^bb15(%266 : i64)
+  ^bb23:  // pred: ^bb15
+    %267 = llvm.add %225, %224 : i64
+    llvm.br ^bb13(%267 : i64)
+  ^bb24:  // pred: ^bb13
+    %268 = llvm.mlir.constant(3 : i64) : i64
+    %269 = llvm.extractvalue %124[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %270 = llvm.ptrtoint %269 : !llvm.ptr to i64
+    %271 = llvm.extractvalue %171[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %272 = llvm.ptrtoint %271 : !llvm.ptr to i64
+    %273 = llvm.extractvalue %154[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %274 = llvm.ptrtoint %273 : !llvm.ptr to i64
+    %275 = llvm.extractvalue %141[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %276 = llvm.ptrtoint %275 : !llvm.ptr to i64
+    %277 = llvm.mlir.constant(16 : i64) : i64
+    %278 = llvm.mlir.constant(2 : i64) : i64
+    %279 = llvm.mlir.constant(4575657221408423952 : i64) : i64
+    "gemmini.intr.config_st"(%278, %279) : (i64, i64) -> ()
+    %280 = llvm.mlir.constant(65540 : i64) : i64
+    %281 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%280, %281) : (i64, i64) -> ()
+    %282 = llvm.mlir.constant(0 : i64) : i64
+    %283 = llvm.mlir.constant(0 : i64) : i64
+    %284 = llvm.mlir.constant(0 : i64) : i64
+    %285 = llvm.mlir.constant(0 : i64) : i64
+    %286 = llvm.mlir.constant(4503612514369537 : i64) : i64
+    %287 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%286, %287) : (i64, i64) -> ()
+    %288 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %289 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%288, %289) : (i64, i64) -> ()
+    %290 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %291 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%290, %291) : (i64, i64) -> ()
+    %292 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %293 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%292, %293) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%276, %272) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%274, %270) : (i64, i64) -> ()
+    %294 = llvm.mlir.constant(768 : i64) : i64
+    %295 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%294, %295) : (i64, i64) -> ()
+    %296 = llvm.mlir.constant(368 : i64) : i64
+    %297 = llvm.add %272, %296 : i64
+    %298 = llvm.mlir.constant(0 : i64) : i64
+    %299 = llvm.mlir.constant(0 : i64) : i64
+    %300 = llvm.mlir.constant(69 : i64) : i64
+    %301 = llvm.add %270, %300 : i64
+    %302 = llvm.mlir.constant(4503612514369537 : i64) : i64
+    %303 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%302, %303) : (i64, i64) -> ()
+    %304 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %305 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%304, %305) : (i64, i64) -> ()
+    %306 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %307 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%306, %307) : (i64, i64) -> ()
+    %308 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %309 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%308, %309) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%276, %297) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%274, %301) : (i64, i64) -> ()
+    %310 = llvm.mlir.constant(768 : i64) : i64
+    %311 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%310, %311) : (i64, i64) -> ()
+    %312 = llvm.mlir.constant(10560 : i64) : i64
+    %313 = llvm.add %272, %312 : i64
+    %314 = llvm.mlir.constant(0 : i64) : i64
+    %315 = llvm.mlir.constant(0 : i64) : i64
+    %316 = llvm.mlir.constant(2112 : i64) : i64
+    %317 = llvm.add %270, %316 : i64
+    %318 = llvm.mlir.constant(4503612514369537 : i64) : i64
+    %319 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%318, %319) : (i64, i64) -> ()
+    %320 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %321 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%320, %321) : (i64, i64) -> ()
+    %322 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %323 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%322, %323) : (i64, i64) -> ()
+    %324 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %325 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%324, %325) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%276, %313) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%274, %317) : (i64, i64) -> ()
+    %326 = llvm.mlir.constant(768 : i64) : i64
+    %327 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%326, %327) : (i64, i64) -> ()
+    %328 = llvm.mlir.constant(10928 : i64) : i64
+    %329 = llvm.add %272, %328 : i64
+    %330 = llvm.mlir.constant(0 : i64) : i64
+    %331 = llvm.mlir.constant(0 : i64) : i64
+    %332 = llvm.mlir.constant(2181 : i64) : i64
+    %333 = llvm.add %270, %332 : i64
+    %334 = llvm.mlir.constant(4503612514369537 : i64) : i64
+    %335 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%334, %335) : (i64, i64) -> ()
+    %336 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %337 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%336, %337) : (i64, i64) -> ()
+    %338 = llvm.mlir.constant(844437815230464 : i64) : i64
+    %339 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%338, %339) : (i64, i64) -> ()
+    %340 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %341 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%340, %341) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%276, %329) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%274, %333) : (i64, i64) -> ()
+    %342 = llvm.mlir.constant(768 : i64) : i64
+    %343 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%342, %343) : (i64, i64) -> ()
+    %344 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%344, %344) : (i64, i64) -> ()
+    %345 = llvm.mlir.constant(0 : index) : i64
+    %346 = llvm.mlir.constant(1 : index) : i64
+    %347 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb25(%345 : i64)
+  ^bb25(%348: i64):  // 2 preds: ^bb24, ^bb35
+    %349 = llvm.icmp "slt" %348, %346 : i64
+    llvm.cond_br %349, ^bb26, ^bb36
+  ^bb26:  // pred: ^bb25
+    %350 = llvm.mlir.constant(0 : index) : i64
+    %351 = llvm.mlir.constant(16 : index) : i64
+    %352 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb27(%350 : i64)
+  ^bb27(%353: i64):  // 2 preds: ^bb26, ^bb34
+    %354 = llvm.icmp "slt" %353, %351 : i64
+    llvm.cond_br %354, ^bb28, ^bb35
+  ^bb28:  // pred: ^bb27
+    %355 = llvm.mlir.constant(0 : index) : i64
+    %356 = llvm.mlir.constant(30 : index) : i64
+    %357 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb29(%355 : i64)
+  ^bb29(%358: i64):  // 2 preds: ^bb28, ^bb33
+    %359 = llvm.icmp "slt" %358, %356 : i64
+    llvm.cond_br %359, ^bb30, ^bb34
+  ^bb30:  // pred: ^bb29
+    %360 = llvm.mlir.constant(0 : index) : i64
+    %361 = llvm.mlir.constant(30 : index) : i64
+    %362 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb31(%360 : i64)
+  ^bb31(%363: i64):  // 2 preds: ^bb30, ^bb32
+    %364 = llvm.icmp "slt" %363, %361 : i64
+    llvm.cond_br %364, ^bb32, ^bb33
+  ^bb32:  // pred: ^bb31
+    %365 = llvm.mlir.constant(30 : index) : i64
+    %366 = llvm.mul %348, %365 : i64
+    %367 = llvm.mul %366, %365 : i64
+    %368 = llvm.mul %358, %365 : i64
+    %369 = llvm.add %367, %368 : i64
+    %370 = llvm.add %369, %363 : i64
+    %371 = llvm.extractvalue %171[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %372 = llvm.mlir.constant(16 : index) : i64
+    %373 = llvm.mul %370, %372 : i64
+    %374 = llvm.add %373, %353 : i64
+    %375 = llvm.getelementptr %371[%374] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %376 = llvm.load %375 : !llvm.ptr -> f32
+    %377 = llvm.extractvalue %73[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %378 = llvm.mlir.constant(14400 : index) : i64
+    %379 = llvm.mul %348, %378 : i64
+    %380 = llvm.mlir.constant(900 : index) : i64
+    %381 = llvm.mul %353, %380 : i64
+    %382 = llvm.add %379, %381 : i64
+    %383 = llvm.mlir.constant(30 : index) : i64
+    %384 = llvm.mul %358, %383 : i64
+    %385 = llvm.add %382, %384 : i64
+    %386 = llvm.add %385, %363 : i64
+    %387 = llvm.getelementptr %377[%386] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %376, %387 : f32, !llvm.ptr
+    %388 = llvm.add %363, %362 : i64
+    llvm.br ^bb31(%388 : i64)
+  ^bb33:  // pred: ^bb31
+    %389 = llvm.add %358, %357 : i64
+    llvm.br ^bb29(%389 : i64)
+  ^bb34:  // pred: ^bb29
+    %390 = llvm.add %353, %352 : i64
+    llvm.br ^bb27(%390 : i64)
+  ^bb35:  // pred: ^bb27
+    %391 = llvm.add %348, %347 : i64
+    llvm.br ^bb25(%391 : i64)
+  ^bb36:  // pred: ^bb25
+    %392 = llvm.extractvalue %124[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    llvm.call @free(%392) : (!llvm.ptr) -> ()
+    %393 = llvm.extractvalue %141[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%393) : (!llvm.ptr) -> ()
+    %394 = llvm.extractvalue %171[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%394) : (!llvm.ptr) -> ()
+    %395 = llvm.extractvalue %154[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    llvm.call @free(%395) : (!llvm.ptr) -> ()
+    %396 = llvm.mlir.constant(1 : index) : i64
+    %397 = llvm.mlir.constant(30 : index) : i64
+    %398 = llvm.mlir.constant(30 : index) : i64
+    %399 = llvm.mlir.constant(16 : index) : i64
+    %400 = llvm.mlir.constant(1 : index) : i64
+    %401 = llvm.mlir.constant(480 : index) : i64
+    %402 = llvm.mlir.constant(14400 : index) : i64
+    %403 = llvm.mlir.constant(14400 : index) : i64
+    %404 = llvm.mlir.zero : !llvm.ptr
+    %405 = llvm.getelementptr %404[%403] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %406 = llvm.ptrtoint %405 : !llvm.ptr to i64
+    %407 = llvm.call @malloc(%406) : (i64) -> !llvm.ptr
+    %408 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %409 = llvm.insertvalue %407, %408[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %410 = llvm.insertvalue %407, %409[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %411 = llvm.mlir.constant(0 : index) : i64
+    %412 = llvm.insertvalue %411, %410[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %413 = llvm.insertvalue %396, %412[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %414 = llvm.insertvalue %397, %413[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %415 = llvm.insertvalue %398, %414[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %416 = llvm.insertvalue %399, %415[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %417 = llvm.insertvalue %402, %416[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %418 = llvm.insertvalue %401, %417[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %419 = llvm.insertvalue %399, %418[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %420 = llvm.insertvalue %400, %419[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %421 = llvm.mlir.constant(144 : index) : i64
+    %422 = llvm.mlir.constant(32 : index) : i64
+    %423 = llvm.mlir.constant(1 : index) : i64
+    %424 = llvm.mlir.constant(4608 : index) : i64
+    %425 = llvm.mlir.zero : !llvm.ptr
+    %426 = llvm.getelementptr %425[%424] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %427 = llvm.ptrtoint %426 : !llvm.ptr to i64
+    %428 = llvm.call @malloc(%427) : (i64) -> !llvm.ptr
+    %429 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %430 = llvm.insertvalue %428, %429[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %431 = llvm.insertvalue %428, %430[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %432 = llvm.mlir.constant(0 : index) : i64
+    %433 = llvm.insertvalue %432, %431[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %434 = llvm.insertvalue %421, %433[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %435 = llvm.insertvalue %422, %434[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %436 = llvm.insertvalue %422, %435[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %437 = llvm.insertvalue %423, %436[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %438 = llvm.mlir.constant(32 : index) : i64
+    %439 = llvm.mlir.constant(1 : index) : i64
+    %440 = llvm.mlir.zero : !llvm.ptr
+    %441 = llvm.getelementptr %440[%438] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %442 = llvm.ptrtoint %441 : !llvm.ptr to i64
+    %443 = llvm.call @malloc(%442) : (i64) -> !llvm.ptr
+    %444 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %445 = llvm.insertvalue %443, %444[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %446 = llvm.insertvalue %443, %445[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %447 = llvm.mlir.constant(0 : index) : i64
+    %448 = llvm.insertvalue %447, %446[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %449 = llvm.insertvalue %438, %448[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %450 = llvm.insertvalue %439, %449[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %451 = llvm.mlir.constant(676 : index) : i64
+    %452 = llvm.mlir.constant(32 : index) : i64
+    %453 = llvm.mlir.constant(1 : index) : i64
+    %454 = llvm.mlir.constant(21632 : index) : i64
+    %455 = llvm.mlir.zero : !llvm.ptr
+    %456 = llvm.getelementptr %455[%454] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %457 = llvm.ptrtoint %456 : !llvm.ptr to i64
+    %458 = llvm.call @malloc(%457) : (i64) -> !llvm.ptr
+    %459 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %460 = llvm.insertvalue %458, %459[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %461 = llvm.insertvalue %458, %460[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %462 = llvm.mlir.constant(0 : index) : i64
+    %463 = llvm.insertvalue %462, %461[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %464 = llvm.insertvalue %451, %463[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %465 = llvm.insertvalue %452, %464[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %466 = llvm.insertvalue %452, %465[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %467 = llvm.insertvalue %453, %466[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %468 = llvm.mlir.constant(26 : i64) : i64
+    %469 = llvm.mlir.constant(3 : index) : i64
+    %470 = llvm.mlir.constant(16 : index) : i64
+    %471 = llvm.mlir.constant(0 : index) : i64
+    %472 = llvm.mlir.constant(1 : index) : i64
+    %473 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb37(%471 : i64)
+  ^bb37(%474: i64):  // 2 preds: ^bb36, ^bb47
+    %475 = llvm.icmp "slt" %474, %472 : i64
+    llvm.cond_br %475, ^bb38, ^bb48
+  ^bb38:  // pred: ^bb37
+    %476 = llvm.mlir.constant(0 : index) : i64
+    %477 = llvm.mlir.constant(16 : index) : i64
+    %478 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb39(%476 : i64)
+  ^bb39(%479: i64):  // 2 preds: ^bb38, ^bb46
+    %480 = llvm.icmp "slt" %479, %477 : i64
+    llvm.cond_br %480, ^bb40, ^bb47
+  ^bb40:  // pred: ^bb39
+    %481 = llvm.mlir.constant(0 : index) : i64
+    %482 = llvm.mlir.constant(30 : index) : i64
+    %483 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb41(%481 : i64)
+  ^bb41(%484: i64):  // 2 preds: ^bb40, ^bb45
+    %485 = llvm.icmp "slt" %484, %482 : i64
+    llvm.cond_br %485, ^bb42, ^bb46
+  ^bb42:  // pred: ^bb41
+    %486 = llvm.mlir.constant(0 : index) : i64
+    %487 = llvm.mlir.constant(30 : index) : i64
+    %488 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb43(%486 : i64)
+  ^bb43(%489: i64):  // 2 preds: ^bb42, ^bb44
+    %490 = llvm.icmp "slt" %489, %487 : i64
+    llvm.cond_br %490, ^bb44, ^bb45
+  ^bb44:  // pred: ^bb43
+    %491 = llvm.extractvalue %73[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %492 = llvm.mlir.constant(14400 : index) : i64
+    %493 = llvm.mul %474, %492 : i64
+    %494 = llvm.mlir.constant(900 : index) : i64
+    %495 = llvm.mul %479, %494 : i64
+    %496 = llvm.add %493, %495 : i64
+    %497 = llvm.mlir.constant(30 : index) : i64
+    %498 = llvm.mul %484, %497 : i64
+    %499 = llvm.add %496, %498 : i64
+    %500 = llvm.add %499, %489 : i64
+    %501 = llvm.getelementptr %491[%500] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %502 = llvm.load %501 : !llvm.ptr -> f32
+    %503 = llvm.extractvalue %420[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %504 = llvm.mlir.constant(14400 : index) : i64
+    %505 = llvm.mul %474, %504 : i64
+    %506 = llvm.mlir.constant(480 : index) : i64
+    %507 = llvm.mul %484, %506 : i64
+    %508 = llvm.add %505, %507 : i64
+    %509 = llvm.mlir.constant(16 : index) : i64
+    %510 = llvm.mul %489, %509 : i64
+    %511 = llvm.add %508, %510 : i64
+    %512 = llvm.add %511, %479 : i64
+    %513 = llvm.getelementptr %503[%512] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %502, %513 : f32, !llvm.ptr
+    %514 = llvm.add %489, %488 : i64
+    llvm.br ^bb43(%514 : i64)
+  ^bb45:  // pred: ^bb43
+    %515 = llvm.add %484, %483 : i64
+    llvm.br ^bb41(%515 : i64)
+  ^bb46:  // pred: ^bb41
+    %516 = llvm.add %479, %478 : i64
+    llvm.br ^bb39(%516 : i64)
+  ^bb47:  // pred: ^bb39
+    %517 = llvm.add %474, %473 : i64
+    llvm.br ^bb37(%517 : i64)
+  ^bb48:  // pred: ^bb37
+    %518 = llvm.mlir.constant(0 : index) : i64
+    %519 = llvm.mlir.constant(32 : index) : i64
+    %520 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb49(%518 : i64)
+  ^bb49(%521: i64):  // 2 preds: ^bb48, ^bb59
+    %522 = llvm.icmp "slt" %521, %519 : i64
+    llvm.cond_br %522, ^bb50, ^bb60
+  ^bb50:  // pred: ^bb49
+    %523 = llvm.mlir.constant(0 : index) : i64
+    %524 = llvm.mlir.constant(16 : index) : i64
+    %525 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb51(%523 : i64)
+  ^bb51(%526: i64):  // 2 preds: ^bb50, ^bb58
+    %527 = llvm.icmp "slt" %526, %524 : i64
+    llvm.cond_br %527, ^bb52, ^bb59
+  ^bb52:  // pred: ^bb51
+    %528 = llvm.mlir.constant(0 : index) : i64
+    %529 = llvm.mlir.constant(3 : index) : i64
+    %530 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb53(%528 : i64)
+  ^bb53(%531: i64):  // 2 preds: ^bb52, ^bb57
+    %532 = llvm.icmp "slt" %531, %529 : i64
+    llvm.cond_br %532, ^bb54, ^bb58
+  ^bb54:  // pred: ^bb53
+    %533 = llvm.mlir.constant(0 : index) : i64
+    %534 = llvm.mlir.constant(3 : index) : i64
+    %535 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb55(%533 : i64)
+  ^bb55(%536: i64):  // 2 preds: ^bb54, ^bb56
+    %537 = llvm.icmp "slt" %536, %534 : i64
+    llvm.cond_br %537, ^bb56, ^bb57
+  ^bb56:  // pred: ^bb55
+    %538 = llvm.mul %531, %469 : i64
+    %539 = llvm.mul %538, %470 : i64
+    %540 = llvm.mul %536, %470 : i64
+    %541 = llvm.add %539, %540 : i64
+    %542 = llvm.add %541, %526 : i64
+    %543 = llvm.extractvalue %24[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %544 = llvm.mlir.constant(144 : index) : i64
+    %545 = llvm.mul %521, %544 : i64
+    %546 = llvm.mlir.constant(9 : index) : i64
+    %547 = llvm.mul %526, %546 : i64
+    %548 = llvm.add %545, %547 : i64
+    %549 = llvm.mlir.constant(3 : index) : i64
+    %550 = llvm.mul %531, %549 : i64
+    %551 = llvm.add %548, %550 : i64
+    %552 = llvm.add %551, %536 : i64
+    %553 = llvm.getelementptr %543[%552] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %554 = llvm.load %553 : !llvm.ptr -> f32
+    %555 = llvm.extractvalue %437[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %556 = llvm.mlir.constant(32 : index) : i64
+    %557 = llvm.mul %542, %556 : i64
+    %558 = llvm.add %557, %521 : i64
+    %559 = llvm.getelementptr %555[%558] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %554, %559 : f32, !llvm.ptr
+    %560 = llvm.add %536, %535 : i64
+    llvm.br ^bb55(%560 : i64)
+  ^bb57:  // pred: ^bb55
+    %561 = llvm.add %531, %530 : i64
+    llvm.br ^bb53(%561 : i64)
+  ^bb58:  // pred: ^bb53
+    %562 = llvm.add %526, %525 : i64
+    llvm.br ^bb51(%562 : i64)
+  ^bb59:  // pred: ^bb51
+    %563 = llvm.add %521, %520 : i64
+    llvm.br ^bb49(%563 : i64)
+  ^bb60:  // pred: ^bb49
+    %564 = llvm.mlir.constant(3 : i64) : i64
+    %565 = llvm.extractvalue %420[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %566 = llvm.ptrtoint %565 : !llvm.ptr to i64
+    %567 = llvm.extractvalue %467[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %568 = llvm.ptrtoint %567 : !llvm.ptr to i64
+    %569 = llvm.extractvalue %450[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %570 = llvm.ptrtoint %569 : !llvm.ptr to i64
+    %571 = llvm.extractvalue %437[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %572 = llvm.ptrtoint %571 : !llvm.ptr to i64
+    %573 = llvm.mlir.constant(32 : i64) : i64
+    %574 = llvm.mlir.constant(2 : i64) : i64
+    %575 = llvm.mlir.constant(4575657221408423968 : i64) : i64
+    "gemmini.intr.config_st"(%574, %575) : (i64, i64) -> ()
+    %576 = llvm.mlir.constant(65540 : i64) : i64
+    %577 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%576, %577) : (i64, i64) -> ()
+    %578 = llvm.mlir.constant(0 : i64) : i64
+    %579 = llvm.mlir.constant(0 : i64) : i64
+    %580 = llvm.mlir.constant(0 : i64) : i64
+    %581 = llvm.mlir.constant(0 : i64) : i64
+    %582 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %583 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%582, %583) : (i64, i64) -> ()
+    %584 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %585 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%584, %585) : (i64, i64) -> ()
+    %586 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %587 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%586, %587) : (i64, i64) -> ()
+    %588 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %589 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%588, %589) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%572, %568) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%570, %566) : (i64, i64) -> ()
+    %590 = llvm.mlir.constant(256 : i64) : i64
+    %591 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%590, %591) : (i64, i64) -> ()
+    %592 = llvm.mlir.constant(16 : i64) : i64
+    %593 = llvm.add %568, %592 : i64
+    %594 = llvm.mlir.constant(64 : i64) : i64
+    %595 = llvm.add %570, %594 : i64
+    %596 = llvm.mlir.constant(16 : i64) : i64
+    %597 = llvm.add %572, %596 : i64
+    %598 = llvm.mlir.constant(0 : i64) : i64
+    %599 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %600 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%599, %600) : (i64, i64) -> ()
+    %601 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %602 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%601, %602) : (i64, i64) -> ()
+    %603 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %604 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%603, %604) : (i64, i64) -> ()
+    %605 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %606 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%605, %606) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%597, %593) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%595, %566) : (i64, i64) -> ()
+    %607 = llvm.mlir.constant(256 : i64) : i64
+    %608 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%607, %608) : (i64, i64) -> ()
+    %609 = llvm.mlir.constant(736 : i64) : i64
+    %610 = llvm.add %568, %609 : i64
+    %611 = llvm.mlir.constant(0 : i64) : i64
+    %612 = llvm.mlir.constant(0 : i64) : i64
+    %613 = llvm.mlir.constant(368 : i64) : i64
+    %614 = llvm.add %566, %613 : i64
+    %615 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %616 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%615, %616) : (i64, i64) -> ()
+    %617 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %618 = llvm.mlir.constant(281569466187792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%617, %618) : (i64, i64) -> ()
+    %619 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %620 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%619, %620) : (i64, i64) -> ()
+    %621 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %622 = llvm.mlir.constant(65539 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%621, %622) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%572, %610) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%570, %614) : (i64, i64) -> ()
+    %623 = llvm.mlir.constant(256 : i64) : i64
+    %624 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%623, %624) : (i64, i64) -> ()
+    %625 = llvm.mlir.constant(752 : i64) : i64
+    %626 = llvm.add %568, %625 : i64
+    %627 = llvm.mlir.constant(64 : i64) : i64
+    %628 = llvm.add %570, %627 : i64
+    %629 = llvm.mlir.constant(16 : i64) : i64
+    %630 = llvm.add %572, %629 : i64
+    %631 = llvm.mlir.constant(368 : i64) : i64
+    %632 = llvm.add %566, %631 : i64
+    %633 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %634 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%633, %634) : (i64, i64) -> ()
+    %635 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %636 = llvm.mlir.constant(281569466187792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%635, %636) : (i64, i64) -> ()
+    %637 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %638 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%637, %638) : (i64, i64) -> ()
+    %639 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %640 = llvm.mlir.constant(65539 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%639, %640) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%630, %626) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%628, %632) : (i64, i64) -> ()
+    %641 = llvm.mlir.constant(256 : i64) : i64
+    %642 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%641, %642) : (i64, i64) -> ()
+    %643 = llvm.mlir.constant(18304 : i64) : i64
+    %644 = llvm.add %568, %643 : i64
+    %645 = llvm.mlir.constant(0 : i64) : i64
+    %646 = llvm.mlir.constant(0 : i64) : i64
+    %647 = llvm.mlir.constant(10560 : i64) : i64
+    %648 = llvm.add %566, %647 : i64
+    %649 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %650 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%649, %650) : (i64, i64) -> ()
+    %651 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %652 = llvm.mlir.constant(281492158087184 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%651, %652) : (i64, i64) -> ()
+    %653 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %654 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%653, %654) : (i64, i64) -> ()
+    %655 = llvm.mlir.constant(1125899906842624 : i64) : i64
+    %656 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%655, %656) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%572, %644) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%570, %648) : (i64, i64) -> ()
+    %657 = llvm.mlir.constant(256 : i64) : i64
+    %658 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%657, %658) : (i64, i64) -> ()
+    %659 = llvm.mlir.constant(18320 : i64) : i64
+    %660 = llvm.add %568, %659 : i64
+    %661 = llvm.mlir.constant(64 : i64) : i64
+    %662 = llvm.add %570, %661 : i64
+    %663 = llvm.mlir.constant(16 : i64) : i64
+    %664 = llvm.add %572, %663 : i64
+    %665 = llvm.mlir.constant(10560 : i64) : i64
+    %666 = llvm.add %566, %665 : i64
+    %667 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %668 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%667, %668) : (i64, i64) -> ()
+    %669 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %670 = llvm.mlir.constant(281492158087184 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%669, %670) : (i64, i64) -> ()
+    %671 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %672 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%671, %672) : (i64, i64) -> ()
+    %673 = llvm.mlir.constant(1125899906842624 : i64) : i64
+    %674 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%673, %674) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%664, %660) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%662, %666) : (i64, i64) -> ()
+    %675 = llvm.mlir.constant(256 : i64) : i64
+    %676 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%675, %676) : (i64, i64) -> ()
+    %677 = llvm.mlir.constant(19040 : i64) : i64
+    %678 = llvm.add %568, %677 : i64
+    %679 = llvm.mlir.constant(0 : i64) : i64
+    %680 = llvm.mlir.constant(0 : i64) : i64
+    %681 = llvm.mlir.constant(10928 : i64) : i64
+    %682 = llvm.add %566, %681 : i64
+    %683 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %684 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%683, %684) : (i64, i64) -> ()
+    %685 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %686 = llvm.mlir.constant(281492156776464 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%685, %686) : (i64, i64) -> ()
+    %687 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %688 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%687, %688) : (i64, i64) -> ()
+    %689 = llvm.mlir.constant(1125899906842624 : i64) : i64
+    %690 = llvm.mlir.constant(65539 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%689, %690) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%572, %678) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%570, %682) : (i64, i64) -> ()
+    %691 = llvm.mlir.constant(256 : i64) : i64
+    %692 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%691, %692) : (i64, i64) -> ()
+    %693 = llvm.mlir.constant(19056 : i64) : i64
+    %694 = llvm.add %568, %693 : i64
+    %695 = llvm.mlir.constant(64 : i64) : i64
+    %696 = llvm.add %570, %695 : i64
+    %697 = llvm.mlir.constant(16 : i64) : i64
+    %698 = llvm.add %572, %697 : i64
+    %699 = llvm.mlir.constant(10928 : i64) : i64
+    %700 = llvm.add %566, %699 : i64
+    %701 = llvm.mlir.constant(9007267976183809 : i64) : i64
+    %702 = llvm.mlir.constant(4296671258 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%701, %702) : (i64, i64) -> ()
+    %703 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %704 = llvm.mlir.constant(281492156776464 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%703, %704) : (i64, i64) -> ()
+    %705 = llvm.mlir.constant(844437816082432 : i64) : i64
+    %706 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%705, %706) : (i64, i64) -> ()
+    %707 = llvm.mlir.constant(1125899906842624 : i64) : i64
+    %708 = llvm.mlir.constant(65539 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%707, %708) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%698, %694) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%696, %700) : (i64, i64) -> ()
+    %709 = llvm.mlir.constant(256 : i64) : i64
+    %710 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%709, %710) : (i64, i64) -> ()
+    %711 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%711, %711) : (i64, i64) -> ()
+    %712 = llvm.mlir.constant(0 : index) : i64
+    %713 = llvm.mlir.constant(1 : index) : i64
+    %714 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb61(%712 : i64)
+  ^bb61(%715: i64):  // 2 preds: ^bb60, ^bb71
+    %716 = llvm.icmp "slt" %715, %713 : i64
+    llvm.cond_br %716, ^bb62, ^bb72
+  ^bb62:  // pred: ^bb61
+    %717 = llvm.mlir.constant(0 : index) : i64
+    %718 = llvm.mlir.constant(32 : index) : i64
+    %719 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb63(%717 : i64)
+  ^bb63(%720: i64):  // 2 preds: ^bb62, ^bb70
+    %721 = llvm.icmp "slt" %720, %718 : i64
+    llvm.cond_br %721, ^bb64, ^bb71
+  ^bb64:  // pred: ^bb63
+    %722 = llvm.mlir.constant(0 : index) : i64
+    %723 = llvm.mlir.constant(26 : index) : i64
+    %724 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb65(%722 : i64)
+  ^bb65(%725: i64):  // 2 preds: ^bb64, ^bb69
+    %726 = llvm.icmp "slt" %725, %723 : i64
+    llvm.cond_br %726, ^bb66, ^bb70
+  ^bb66:  // pred: ^bb65
+    %727 = llvm.mlir.constant(0 : index) : i64
+    %728 = llvm.mlir.constant(26 : index) : i64
+    %729 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb67(%727 : i64)
+  ^bb67(%730: i64):  // 2 preds: ^bb66, ^bb68
+    %731 = llvm.icmp "slt" %730, %728 : i64
+    llvm.cond_br %731, ^bb68, ^bb69
+  ^bb68:  // pred: ^bb67
+    %732 = llvm.mlir.constant(26 : index) : i64
+    %733 = llvm.mul %715, %732 : i64
+    %734 = llvm.mul %733, %732 : i64
+    %735 = llvm.mul %725, %732 : i64
+    %736 = llvm.add %734, %735 : i64
+    %737 = llvm.add %736, %730 : i64
+    %738 = llvm.extractvalue %467[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %739 = llvm.mlir.constant(32 : index) : i64
+    %740 = llvm.mul %737, %739 : i64
+    %741 = llvm.add %740, %720 : i64
+    %742 = llvm.getelementptr %738[%741] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %743 = llvm.load %742 : !llvm.ptr -> f32
+    %744 = llvm.extractvalue %98[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %745 = llvm.mlir.constant(21632 : index) : i64
+    %746 = llvm.mul %715, %745 : i64
+    %747 = llvm.mlir.constant(676 : index) : i64
+    %748 = llvm.mul %720, %747 : i64
+    %749 = llvm.add %746, %748 : i64
+    %750 = llvm.mlir.constant(26 : index) : i64
+    %751 = llvm.mul %725, %750 : i64
+    %752 = llvm.add %749, %751 : i64
+    %753 = llvm.add %752, %730 : i64
+    %754 = llvm.getelementptr %744[%753] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %743, %754 : f32, !llvm.ptr
+    %755 = llvm.add %730, %729 : i64
+    llvm.br ^bb67(%755 : i64)
+  ^bb69:  // pred: ^bb67
+    %756 = llvm.add %725, %724 : i64
+    llvm.br ^bb65(%756 : i64)
+  ^bb70:  // pred: ^bb65
+    %757 = llvm.add %720, %719 : i64
+    llvm.br ^bb63(%757 : i64)
+  ^bb71:  // pred: ^bb63
+    %758 = llvm.add %715, %714 : i64
+    llvm.br ^bb61(%758 : i64)
+  ^bb72:  // pred: ^bb61
+    %759 = llvm.extractvalue %420[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    llvm.call @free(%759) : (!llvm.ptr) -> ()
+    %760 = llvm.extractvalue %437[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%760) : (!llvm.ptr) -> ()
+    %761 = llvm.extractvalue %467[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%761) : (!llvm.ptr) -> ()
+    %762 = llvm.extractvalue %450[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    llvm.call @free(%762) : (!llvm.ptr) -> ()
+    linalg.copy ins(%99 : memref<1x32x26x26xf32>) outs(%12 : memref<1x32x26x26xf32>)
+    %763 = llvm.extractvalue %73[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    llvm.call @free(%763) : (!llvm.ptr) -> ()
+    %764 = llvm.extractvalue %98[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    llvm.call @free(%764) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+}
+
+
diff --git a/experiments/gemmini/networks/mini_cnn_block.mlir b/experiments/gemmini/networks/mini_cnn_block.mlir
new file mode 100644
index 0000000..6ae76a9
--- /dev/null
+++ b/experiments/gemmini/networks/mini_cnn_block.mlir
@@ -0,0 +1,34 @@
+module {
+  func.func @mini_cnn_block(
+      %input: memref<1x3x32x32xf32>,      // NCHW input: 1 image, 3 channels, 32x32
+      %w1:    memref<16x3x3x3xf32>,       // conv1 weights, FCHW: 16 filters over 3 channels, 3x3
+      %w2:    memref<32x16x3x3xf32>,      // conv2 weights, FCHW: 32 filters over 16 channels, 3x3
+      %out:   memref<1x32x26x26xf32>      // caller-provided destination; receives a copy of %conv2
+  ) {
+    %conv1 = memref.alloc() : memref<1x16x30x30xf32>
+    %conv2 = memref.alloc() : memref<1x32x26x26xf32>
+    // NOTE(review): neither buffer is filled before it is used as a conv `outs` operand - confirm the lowering never reads them.
+    // Conv 1: 3x3, stride 1, NCHW x FCHW (32x32 input -> 30x30 output)
+    linalg.conv_2d_nchw_fchw
+      ins(%input, %w1
+          : memref<1x3x32x32xf32>, memref<16x3x3x3xf32>)
+      outs(%conv1
+          : memref<1x16x30x30xf32>)
+
+    // Conv 2: 3x3, stride 1, NCHW x FCHW. NOTE(review): 30x30 in with a 3x3 kernel gives 28x28, but %conv2 is 26x26 - confirm intended.
+    linalg.conv_2d_nchw_fchw
+      ins(%conv1, %w2
+          : memref<1x16x30x30xf32>, memref<32x16x3x3xf32>)
+      outs(%conv2
+          : memref<1x32x26x26xf32>)
+
+    // Copy %conv2 into %out; this is the block's only output for now (no FC layer yet)
+    linalg.copy
+      ins(%conv2 : memref<1x32x26x26xf32>)
+      outs(%out  : memref<1x32x26x26xf32>)
+
+    memref.dealloc %conv1 : memref<1x16x30x30xf32>
+    memref.dealloc %conv2 : memref<1x32x26x26xf32>
+    return
+  }
+}

From 4f19571451d05eda7050f3cbbf46860d366787df Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Fri, 16 Jan 2026 19:05:57 -0800
Subject: [PATCH 09/13] Docs: add Gemmini lowering coverage table

---
 experiments/gemmini/SUPPORT.md | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 experiments/gemmini/SUPPORT.md

diff --git a/experiments/gemmini/SUPPORT.md b/experiments/gemmini/SUPPORT.md
new file mode 100644
index 0000000..06d455a
--- /dev/null
+++ b/experiments/gemmini/SUPPORT.md
@@ -0,0 +1,11 @@
+# Buddy Gemmini lowering coverage (Sparsh)
+
+This table gives a quick view of the inputs we’ve stress-tested and the Gemmini ops Buddy lowers each of them into.
+
+| Test | Input dialect/op | Layout | Proof of Gemmini match | Proof of Gemmini command expansion | Notes |
+|---|---|---|---|---|---|
+| matmul | linalg.matmul | (varies) | `gemmini.tile_matmul` | `gemmini.intr.loop_ws_config*` + `gemmini.intr.loop_ws` | matmul lowered end-to-end |
+| batch_matmul | linalg.batch_matmul | (varies) | `gemmini.tile_*` | `gemmini.intr.*` | batched path works |
+| conv (NHWC/HWCF) | linalg.conv_2d_nhwc_hwcf | NHWC x HWCF | `gemmini.tile_conv` | `gemmini.intr.loop_conv_ws_config*` + `gemmini.intr.loop_conv_ws` | conv lowered to WS loop |
+| conv (NCHW/FCHW) | linalg.conv_2d_nchw_fchw | NCHW x FCHW | `gemmini.tile_conv` | `gemmini.intr.loop_conv_ws_config*` + `gemmini.intr.loop_conv_ws` | alternate layout works |
+| mini CNN block | 2x conv + copy | NCHW/FCHW | 2x `gemmini.tile_conv` | `gemmini.intr.loop_conv_ws_config*` + `gemmini.intr.loop_conv_ws` | multi-layer block lowers; the trailing `linalg.copy` is left unlowered in the dump |

From a81f917524c3d75011c6ede586efc26fe7f1e7ea Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Fri, 16 Jan 2026 20:35:43 -0800
Subject: [PATCH 10/13] IREE: add iree-compile IR dump (print-after-all) for
 baseline pipeline

---
 .../iree/logs/iree.print-after-all.mlir       | 27771 ++++++++++++++++
 1 file changed, 27771 insertions(+)
 create mode 100644 experiments/iree/logs/iree.print-after-all.mlir

diff --git a/experiments/iree/logs/iree.print-after-all.mlir b/experiments/iree/logs/iree.print-after-all.mlir
new file mode 100644
index 0000000..509d405
--- /dev/null
+++ b/experiments/iree/logs/iree.print-after-all.mlir
@@ -0,0 +1,27771 @@
+// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
+module {
+  func.func @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
+module {
+  util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
+module {
+  util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
+module {
+  util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After ConvertShardToFlowPass (iree-convert-shard-to-flow) //----- //
+module {
+  util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
+module {
+  util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
+module {
+  util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
+module {
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2:2 = util.call @_multiple_results(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>)
+    %3 = hal.tensor.export %2#0 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %4 = hal.tensor.export %2#1 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+  }
+  util.func private @_multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) attributes {hal.abi.convention = #hal.abi.convention<synchronous>} {
+    %0 = math.absf %arg0 : tensor<2xf32>
+    %1 = math.absf %arg1 : tensor<2xf32>
+    util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @_multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) attributes {hal.abi.convention = #hal.abi.convention<synchronous>} {
+  %0 = math.absf %arg0 : tensor<2xf32>
+  %1 = math.absf %arg1 : tensor<2xf32>
+  util.return %0, %1 : tensor<2xf32>, tensor<2xf32>
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2:2 = util.call @_multiple_results(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>)
+  %3 = hal.tensor.export %2#0 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %4 = hal.tensor.export %2#1 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Inliner (inline) //----- //
+module {
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module {
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {hal.device.targets = [#device_target_local]} {
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyDepthwiseConvPass (simplify-depthwise-conv) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = math.absf %0 : tensor<2xf32>
+  %3 = math.absf %1 : tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = math.absf %0 : tensor<2xf32>
+    %3 = math.absf %1 : tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = math.absf %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After InsertTensorBarriersPass (iree-dispatch-creation-insert-tensor-barriers) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ConvertStridedContractionToContractionPass (iree-global-opt-convert-strided-contraction-to-contraction) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %10 = math.absf %in : f32
+      linalg.yield %10 : f32
+    } -> tensor<2xf32>
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %10 = math.absf %in : f32
+    linalg.yield %10 : f32
+  } -> tensor<2xf32>
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %12 = math.absf %in : f32
+    linalg.yield %12 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %12 = math.absf %in : f32
+    linalg.yield %12 : f32
+  } -> tensor<2xf32>
+  %9 = iree_tensor_ext.compute_barrier.end %8 : tensor<2xf32> -> tensor<2xf32>
+  %10 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %11 = hal.tensor.export %9 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %10, %11 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %12 = math.absf %in : f32
+    linalg.yield %12 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %12 = math.absf %in : f32
+    linalg.yield %12 : f32
+  } -> tensor<2xf32>
+  %9 = iree_tensor_ext.compute_barrier.end %8 : tensor<2xf32> -> tensor<2xf32>
+  %10 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %11 = hal.tensor.export %9 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %10, %11 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FormSplitReductionDispatchesPass (iree-dispatch-creation-form-split-reduction-dispatches) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After PropagateEncodingsPass (iree-dispatch-creation-propagate-encodings) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = tensor.empty() : tensor<2xf32>
+    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %11 = math.absf %in : f32
+      linalg.yield %11 : f32
+    } -> tensor<2xf32>
+    %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+    %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %11 = math.absf %in : f32
+      linalg.yield %11 : f32
+    } -> tensor<2xf32>
+    %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+    %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %11 = math.absf %in : f32
+    linalg.yield %11 : f32
+  } -> tensor<2xf32>
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = flow.dispatch.region -> (tensor<2xf32>) {
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = flow.dispatch.region -> (tensor<2xf32>) {
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = flow.dispatch.region -> (tensor<2xf32>) {
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = flow.dispatch.region -> (tensor<2xf32>) {
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = tensor.empty() : tensor<2xf32>
+  %5 = flow.dispatch.region -> (tensor<2xf32>) {
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32>
+  %7 = flow.dispatch.region -> (tensor<2xf32>) {
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32>
+  %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %9, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After HoistUniformScalarComputePass (iree-dispatch-creation-hoist-uniform-scalar-compute) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FuseEncodingOpsIntoDispatchRegionsPass (iree-dispatch-creation-fuse-encoding-ops-into-dispatch-regions-pass) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+  %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+  %4 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+  %6 = flow.dispatch.region -> (tensor<2xf32>) {
+    %10 = tensor.empty() : tensor<2xf32>
+    %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %12 = math.absf %in : f32
+      linalg.yield %12 : f32
+    } -> tensor<2xf32>
+    flow.return %11 : tensor<2xf32>
+  }
+  %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+  %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32>
+    %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32>
+    %4 = flow.dispatch.region -> (tensor<2xf32>) {
+      %10 = tensor.empty() : tensor<2xf32>
+      %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %12 = math.absf %in : f32
+        linalg.yield %12 : f32
+      } -> tensor<2xf32>
+      flow.return %11 : tensor<2xf32>
+    }
+    %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32>
+    %6 = flow.dispatch.region -> (tensor<2xf32>) {
+      %10 = tensor.empty() : tensor<2xf32>
+      %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %12 = math.absf %in : f32
+        linalg.yield %12 : f32
+      } -> tensor<2xf32>
+      flow.return %11 : tensor<2xf32>
+    }
+    %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32>
+    %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %8, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After RemoveTensorBarriersPass (iree-dispatch-creation-remove-tensor-barriers) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.region -> (tensor<2xf32>) {
+    %6 = tensor.empty() : tensor<2xf32>
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %8 = math.absf %in : f32
+      linalg.yield %8 : f32
+    } -> tensor<2xf32>
+    flow.return %7 : tensor<2xf32>
+  }
+  %3 = flow.dispatch.region -> (tensor<2xf32>) {
+    %6 = tensor.empty() : tensor<2xf32>
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %8 = math.absf %in : f32
+      linalg.yield %8 : f32
+    } -> tensor<2xf32>
+    flow.return %7 : tensor<2xf32>
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After BitcastUnsupportedElementTypesPass (iree-dispatch-creation-bitcast-unsupported-element-types) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AttributeCallGraphPass (iree-util-attribute-call-graph) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+      (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+    %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %7 = tensor.empty() : tensor<2xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    flow.return
+  } count() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> =
+        (%arg2: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %7 = tensor.empty() : tensor<2xf32>
+      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %9 = math.absf %in : f32
+        linalg.yield %9 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      flow.return
+    } count() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  flow.executable private @multiple_results_dispatch_1 {
+    flow.executable.export public @multiple_results_dispatch_1 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_1(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_1::@multiple_results_dispatch_1(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  flow.executable private @multiple_results_dispatch_1 {
+    flow.executable.export public @multiple_results_dispatch_1_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_1_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_1::@multiple_results_dispatch_1_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
+flow.executable private @multiple_results_dispatch_1 {
+  flow.executable.export public @multiple_results_dispatch_1_elementwise_2_f32 workgroups() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  builtin.module {
+    func.func @multiple_results_dispatch_1_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %1 = tensor.empty() : tensor<2xf32>
+      %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %3 = math.absf %in : f32
+        linalg.yield %3 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_1::@multiple_results_dispatch_1_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
+flow.executable private @multiple_results_dispatch_0 {
+  flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    flow.return %x, %y, %z : index, index, index
+  }
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+      %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %1 = tensor.empty() : tensor<2xf32>
+      %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %3 = math.absf %in : f32
+        linalg.yield %3 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+  %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+  %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+  %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+  %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After CloneToConsumersPass (iree-stream-clone-to-consumers) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  flow.executable private @multiple_results_dispatch_0 {
+    flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      flow.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>) {
+        %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %1 = tensor.empty() : tensor<2xf32>
+        %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %3 = math.absf %in : f32
+          linalg.yield %3 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32>
+    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32>
+    %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32>
+    %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32>
+    %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view
+    %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    %c2 = arith.constant 2 : index
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
+    %element_type_f32_0 = hal.element_type<f32> : i32
+    %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
+    %c2_2 = arith.constant 2 : index
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2_2]) type(%element_type_f32_0) encoding(%dense_row_major_1)
+    %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%3}
+    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3}
+    %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6}
+    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8}
+    %10 = stream.async.transfer %7 : !stream.resource<*>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%6}
+    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
+    %12 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
+    %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
+    util.return %11, %13 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    %c2 = arith.constant 2 : index
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
+    %element_type_f32_0 = hal.element_type<f32> : i32
+    %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
+    %c2_2 = arith.constant 2 : index
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2_2]) type(%element_type_f32_0) encoding(%dense_row_major_1)
+    %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%3}
+    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3}
+    %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6}
+    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8}
+    %10 = stream.async.transfer %7 : !stream.resource<*>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%6}
+    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
+    %12 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
+    %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
+    util.return %11, %13 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+  %c0 = arith.constant 0 : index
+  %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %3 = tensor.empty() : tensor<2xf32>
+  %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %5 = math.absf %in : f32
+    linalg.yield %5 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %element_type_f32_0 = hal.element_type<f32> : i32
+  %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32_0) encoding(%dense_row_major_1)
+  %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%3}
+  %5 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
+  %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6}
+  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8}
+  %10 = stream.async.clone on(#hal.device.affinity<@__device_0>) %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
+  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
+  %12 = stream.async.clone on(#hal.device.affinity<@__device_0>) %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
+  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
+  util.return %11, %13 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Inliner (inline) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %element_type_f32_0 = hal.element_type<f32> : i32
+    %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32_0) encoding(%dense_row_major_1)
+    %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%3}
+    %5 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
+    %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6}
+    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8}
+    %10 = stream.async.clone on(#hal.device.affinity<@__device_0>) %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
+    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
+    %12 = stream.async.clone on(#hal.device.affinity<@__device_0>) %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
+    %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
+    util.return %11, %13 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %element_type_f32_0 = hal.element_type<f32> : i32
+  %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32_0) encoding(%dense_row_major_1)
+  %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%3}
+  %5 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
+  %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6}
+  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8}
+  %10 = stream.async.clone on(#hal.device.affinity<@__device_0>) %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
+  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
+  %12 = stream.async.clone on(#hal.device.affinity<@__device_0>) %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
+  %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
+  util.return %11, %13 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+  %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+  %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+  %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+  util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SpecializeEncodingsPass (iree-stream-specialize-encodings) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%0}
+    %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
+    %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0}
+    %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
+    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource<external>{%0} -> !hal.buffer_view
+    util.return %8, %10 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
+stream.executable private @multiple_results_dispatch_0 {
+  stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    stream.return %x, %y, %z : index, index, index
+  }
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+      %c0 = arith.constant 0 : index
+      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %3 = tensor.empty() : tensor<2xf32>
+      %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %5 = math.absf %in : f32
+        linalg.yield %5 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After MaterializeEncodingsPass (iree-stream-materialize-encodings) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<*>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+    %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+  %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<external>{%c8} -> !stream.resource<external>{%c8}
+  %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %9 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %4, %5 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg2[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg3[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+    stream.yield %5, %6 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    }
+    stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SyncInitializersPass (iree-stream-sync-initializers) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.timepoint.immediate => !stream.timepoint
+    %3 = stream.timepoint.immediate => !stream.timepoint
+    %4 = stream.timepoint.join max(%2, %3) => !stream.timepoint
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%4) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %8:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %9 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %10 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %9, %10 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %8#0, %8#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %5:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %2 = stream.timepoint.immediate => !stream.timepoint
+    %3 = stream.timepoint.immediate => !stream.timepoint
+    %4 = stream.timepoint.join max(%2, %3) => !stream.timepoint
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%4) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %8:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %9 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %10 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %9, %10 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %8#0, %8#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %5:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    }
+    stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    }
+    stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    }
+    stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    }
+    stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+    %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+      stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    }
+    stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  } => !stream.timepoint
+  %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+      %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource<external>{%c8}, %arg3 as %arg5: !stream.resource<external>{%c8}) -> (!stream.resource<external>{%c8}, !stream.resource<external>{%c8}) {
+        %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<external>{%c8}
+        stream.yield %6, %7 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+      }
+      stream.yield %5#0, %5#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    } => !stream.timepoint
+    %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %3, %4 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %c0_0 = arith.constant 0 : index
+    %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({
+      [0, 0] = %c8,
+      [0, 0] = %c8
+    }) : index
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%2#0} => !stream.timepoint
+    %3 = stream.resource.subview %result[%2#1] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+    %4 = stream.resource.subview %result[%2#2] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+    %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %3 as %arg4: !stream.resource<external>{%c8}, %4 as %arg5: !stream.resource<external>{%c8}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c8] : !stream.resource<external>{%c8}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg5[%c0_0 for %c8] : !stream.resource<external>{%c8}
+        }
+      }
+    } => !stream.timepoint
+    %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %8 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After EmplaceTransientsPass (iree-stream-emplace-transients) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %c0_0 = arith.constant 0 : index
+    %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({
+      [0, 0] = %c8,
+      [0, 0] = %c8
+    }) : index
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%2#0} => !stream.timepoint
+    %3 = stream.resource.subview %result[%2#1] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+    %4 = stream.resource.subview %result[%2#2] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+    %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %3 as %arg4: !stream.resource<external>{%c8}, %4 as %arg5: !stream.resource<external>{%c8}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c8] : !stream.resource<external>{%c8}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg5[%c0_0 for %c8] : !stream.resource<external>{%c8}
+        }
+      }
+    } => !stream.timepoint
+    %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %8 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeTransientSizeQueriesPass (iree-stream-materialize-transient-size-queries) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %c0_0 = arith.constant 0 : index
+    %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({
+      [0, 0] = %c8,
+      [0, 0] = %c8
+    }) : index
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%2#0} => !stream.timepoint
+    %3 = stream.resource.subview %result[%2#1] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+    %4 = stream.resource.subview %result[%2#2] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+    %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %3 as %arg4: !stream.resource<external>{%c8}, %4 as %arg5: !stream.resource<external>{%c8}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c8] : !stream.resource<external>{%c8}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg5[%c0_0 for %c8] : !stream.resource<external>{%c8}
+        }
+      }
+    } => !stream.timepoint
+    %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %7, %8 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %c0_0 = arith.constant 0 : index
+  %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({
+    [0, 0] = %c8,
+    [0, 0] = %c8
+  }) : index
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%2#0} => !stream.timepoint
+  %3 = stream.resource.subview %result[%2#1] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+  %4 = stream.resource.subview %result[%2#2] : !stream.resource<external>{%2#0} -> !stream.resource<external>{%c8}
+  %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %3 as %arg4: !stream.resource<external>{%c8}, %4 as %arg5: !stream.resource<external>{%c8}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0_0 for %c8] : !stream.resource<external>{%c8}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg5[%c0_0 for %c8] : !stream.resource<external>{%c8}
+      }
+    }
+  } => !stream.timepoint
+  %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %7, %8 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %c0_0 = arith.constant 0 : index
+  %c0_1 = arith.constant 0 : index
+  %c64 = arith.constant 64 : index
+  %c64_2 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c128_3 = arith.constant 128 : index
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128_3} => !stream.timepoint
+  %2 = stream.resource.subview %result[%c0_1] : !stream.resource<external>{%c128_3} -> !stream.resource<external>{%c8}
+  %3 = stream.resource.subview %result[%c64_2] : !stream.resource<external>{%c128_3} -> !stream.resource<external>{%c8}
+  %4 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %2 as %arg4: !stream.resource<external>{%c8}, %3 as %arg5: !stream.resource<external>{%c8}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0_0 for %c8] : !stream.resource<external>{%c8}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg5[%c0_0 for %c8] : !stream.resource<external>{%c8}
+      }
+    }
+  } => !stream.timepoint
+  %5:2 = stream.timepoint.await %4 => %2, %3 : !stream.resource<external>{%c8}, !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#0 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#1 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c0_0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0_0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AutomaticReferenceCountingPass (iree-stream-automatic-reference-counting) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AnnotateConstantTransientSizePass (iree-stream-annotate-constant-transient-size) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ReuseAllocationsPass (iree-stream-reuse-allocations) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c8] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c64 for %c8] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%arg2] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%arg3] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %c0_0 = arith.constant 0 : index
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c0 : index, index) {
+          ro %arg2[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c64 : index, index) {
+          ro %arg3[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: index {stream.values = [0 : index]}, %arg3: index {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]}) {
+        %c0 = arith.constant 0 : index
+        %0 = stream.binding.subspan %arg0[%arg2] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %1 = stream.binding.subspan %arg1[%arg3] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %3 = tensor.empty() : tensor<2xf32>
+        %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %5 = math.absf %in : f32
+          linalg.yield %5 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %c0_0 = arith.constant 0 : index
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c0 : index, index) {
+          ro %arg2[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c64 : index, index) {
+          ro %arg3[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: index {stream.values = [0 : index]}, %arg3: index {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]}) {
+        %0:2 = util.assume.int 
+            %arg2[<umin = 0, umax = 0>, <umin = 0, umax = 0>], 
+            %arg3[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>]
+          : index, index
+        %c0 = arith.constant 0 : index
+        %1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %3 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %4 = tensor.empty() : tensor<2xf32>
+        %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %6 = math.absf %in : f32
+          linalg.yield %6 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %5, %2, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %c0_0 = arith.constant 0 : index
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c0 : index, index) {
+          ro %arg2[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c64 : index, index) {
+          ro %arg3[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
+        %0 = arith.extui %arg2 : i32 to i64
+        %1 = arith.extui %arg3 : i32 to i64
+        %c32_i64 = arith.constant 32 : i64
+        %2 = arith.shli %1, %c32_i64 : i64
+        %3 = arith.ori %0, %2 : i64
+        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
+        %5 = arith.extui %arg4 : i32 to i64
+        %6 = arith.extui %arg5 : i32 to i64
+        %c32_i64_0 = arith.constant 32 : i64
+        %7 = arith.shli %6, %c32_i64_0 : i64
+        %8 = arith.ori %5, %7 : i64
+        %9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index
+        %10:2 = util.assume.int 
+            %4[<umin = 0, umax = 0>, <umin = 0, umax = 0>], 
+            %9[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>]
+          : index, index
+        %c0 = arith.constant 0 : index
+        %11 = stream.binding.subspan %arg0[%10#0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %12 = stream.binding.subspan %arg1[%10#1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %13 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %14 = tensor.empty() : tensor<2xf32>
+        %15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%13 : tensor<2xf32>) outs(%14 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %16 = math.absf %in : f32
+          linalg.yield %16 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %15, %12, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %c0_0 = arith.constant 0 : index
+    %c0_i64 = arith.constant 0 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %c32_i64 = arith.constant 32 : i64
+    %c0_i64_1 = arith.constant 0 : i64
+    %c0_i32_2 = arith.constant 0 : i32
+    %c0_i64_3 = arith.constant 0 : i64
+    %c0_i32_4 = arith.constant 0 : i32
+    %c32_i64_5 = arith.constant 32 : i64
+    %c0_i64_6 = arith.constant 0 : i64
+    %c0_i32_7 = arith.constant 0 : i32
+    %c0_i64_8 = arith.constant 0 : i64
+    %c0_i32_9 = arith.constant 0 : i32
+    %c32_i64_10 = arith.constant 32 : i64
+    %c0_i64_11 = arith.constant 0 : i64
+    %c0_i32_12 = arith.constant 0 : i32
+    %c64_i64 = arith.constant 64 : i64
+    %c64_i32 = arith.constant 64 : i32
+    %c32_i64_13 = arith.constant 32 : i64
+    %c0_i64_14 = arith.constant 0 : i64
+    %c0_i32_15 = arith.constant 0 : i32
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7 : i32, i32, i32, i32) {
+          ro %arg2[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32_9, %c0_i32_12, %c64_i32, %c0_i32_15 : i32, i32, i32, i32) {
+          ro %arg3[%c0_0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0_0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
+        %c0 = arith.constant 0 : index
+        %c32_i64 = arith.constant 32 : i64
+        %0 = arith.extui %arg4 : i32 to i64
+        %1 = arith.extui %arg5 : i32 to i64
+        %2 = arith.shli %1, %c32_i64 : i64
+        %3 = arith.ori %0, %2 : i64
+        %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index
+        %5 = util.assume.int %4[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %9 = tensor.empty() : tensor<2xf32>
+        %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %11 = math.absf %in : f32
+          linalg.yield %11 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
+        %c0 = arith.constant 0 : index
+        %c32_i64 = arith.constant 32 : i64
+        %0 = arith.extui %arg4 : i32 to i64
+        %1 = arith.extui %arg5 : i32 to i64
+        %2 = arith.shli %1, %c32_i64 : i64
+        %3 = arith.ori %0, %2 : i64
+        %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index
+        %5 = util.assume.int %4[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %9 = tensor.empty() : tensor<2xf32>
+        %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %11 = math.absf %in : f32
+          linalg.yield %11 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
+        %c0 = arith.constant 0 : index
+        %c32_i64 = arith.constant 32 : i64
+        %0 = arith.extui %arg4 : i32 to i64
+        %1 = arith.extui %arg5 : i32 to i64
+        %2 = arith.shli %1, %c32_i64 : i64
+        %3 = arith.ori %0, %2 : i64
+        %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index
+        %5 = util.assume.int %4[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %9 = tensor.empty() : tensor<2xf32>
+        %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %11 = math.absf %in : f32
+          linalg.yield %11 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0_i32 = arith.constant 0 : i32
+        %c0 = arith.constant 0 : index
+        %c32_i64 = arith.constant 32 : i64
+        %0 = arith.extui %arg2 : i32 to i64
+        %1 = arith.extui %c0_i32 : i32 to i64
+        %2 = arith.shli %1, %c32_i64 : i64
+        %3 = arith.ori %0, %2 : i64
+        %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index
+        %5 = util.assume.int %4[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %9 = tensor.empty() : tensor<2xf32>
+        %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %11 = math.absf %in : f32
+          linalg.yield %11 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AttributeCallGraphPass (iree-util-attribute-call-graph) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c64_i32 = arith.constant 64 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+  %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+    stream.cmd.concurrent {
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+        ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+      stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+        ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+        wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+      }
+    }
+  } => !stream.timepoint
+  %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+  %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+  %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+  util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  stream.executable private @multiple_results_dispatch_0 {
+    stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      stream.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) {
+        %c0 = arith.constant 0 : index
+        %0 = arith.index_castui %arg2 : i32 to index
+        %1 = util.assume.int %0[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %5 = tensor.empty() : tensor<2xf32>
+        %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {
+        ^bb0(%in: f32, %out: f32):
+          %7 = math.absf %in : f32
+          linalg.yield %7 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+        %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+        hal.return %x, %y, %z : index, index, index
+      }
+      builtin.module {
+        func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+          %c0 = arith.constant 0 : index
+          %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+          %1 = arith.index_castui %0 : i32 to index
+          %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+          %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+          %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+          %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+          %6 = tensor.empty() : tensor<2xf32>
+          %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+          ^bb0(%in: f32, %out: f32):
+            %8 = math.absf %in : f32
+            linalg.yield %8 : f32
+          } -> tensor<2xf32>
+          iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+          return
+        }
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0) -> (d0)>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+        %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+        hal.return %x, %y, %z : index, index, index
+      }
+      builtin.module {
+        func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+          %c0 = arith.constant 0 : index
+          %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+          %1 = arith.index_castui %0 : i32 to index
+          %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+          %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+          %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+          %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+          %6 = tensor.empty() : tensor<2xf32>
+          %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+          ^bb0(%in: f32, %out: f32):
+            %8 = math.absf %in : f32
+            linalg.yield %8 : f32
+          } -> tensor<2xf32>
+          iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+          return
+        }
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource<external>{%c8}
+    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c128} => !stream.timepoint
+    %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c8}, %1 as %arg3: !stream.resource<external>{%c8}, %result as %arg4: !stream.resource<external>{%c128}) {
+      stream.cmd.concurrent {
+        stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) {
+          ro %arg2[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+        stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) {
+          ro %arg3[%c0 for %c8] : !stream.resource<external>{%c8},
+          wo %arg4[%c0 for %c128] : !stream.resource<external>{%c128}
+        }
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c128}
+    %4 = stream.resource.subview %3[%c0] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %5 = stream.resource.subview %3[%c64] : !stream.resource<external>{%c128} -> !stream.resource<external>{%c8}
+    %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource<external>{%c8} -> !hal.buffer_view
+    util.return %6, %7 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SpecializeExportsPass (iree-codegen-specialize-exports) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    hal.return %x, %y, %z : index, index, index
+  }
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+      %c0 = arith.constant 0 : index
+      %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+      %1 = arith.index_castui %0 : i32 to index
+      %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+      %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+      %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %6 = tensor.empty() : tensor<2xf32>
+      %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %8 = math.absf %in : f32
+        linalg.yield %8 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
+module {
+  func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+    %1 = arith.index_castui %0 : i32 to index
+    %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+    %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+    %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %6 = tensor.empty() : tensor<2xf32>
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %8 = math.absf %in : f32
+      linalg.yield %8 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    return
+  }
+}
+
+// -----// IR Dump After MaterializeDeviceEncodingPass (iree-codegen-materialize-device-encoding) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After CPUPropagateDataLayoutPass (iree-codegen-cpu-propagate-data-layout) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After ConvertAccGEMMToGEMMPass (iree-convert-accgemm-to-gemm) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = math.absf %in : f32
+    linalg.yield %8 : f32
+  } -> tensor<2xf32>
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
+module {
+  func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+    %1 = arith.index_castui %0 : i32 to index
+    %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+    %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+    %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %6 = tensor.empty() : tensor<2xf32>
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %8 = math.absf %in : f32
+      linalg.yield %8 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    return
+  }
+}
+
+// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    hal.return %x, %y, %z : index, index, index
+  }
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+      %c0 = arith.constant 0 : index
+      %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+      %1 = arith.index_castui %0 : i32 to index
+      %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+      %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+      %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+      %6 = tensor.empty() : tensor<2xf32>
+      %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+      ^bb0(%in: f32, %out: f32):
+        %8 = math.absf %in : f32
+        linalg.yield %8 : f32
+      } -> tensor<2xf32>
+      iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
+hal.executable private @multiple_results_dispatch_0 {
+  hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+    hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+        %1 = arith.index_castui %0 : i32 to index
+        %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+        %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+        %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+        %6 = tensor.empty() : tensor<2xf32>
+        %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+        ^bb0(%in: f32, %out: f32):
+          %8 = math.absf %in : f32
+          linalg.yield %8 : f32
+        } -> tensor<2xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+        return
+      }
+    }
+  }
+}
+
+// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
+module {
+  func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+    %1 = arith.index_castui %0 : i32 to index
+    %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+    %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+    %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+    %6 = tensor.empty() : tensor<2xf32>
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %8 = math.absf %in : f32
+      linalg.yield %8 : f32
+    } -> tensor<2xf32>
+    iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+    return
+  }
+}
+
+// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[%c0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  return
+}
+
+// -----// IR Dump After BufferizeDispatchTensorLoadStorePass (iree-codegen-bufferize-dispatch-tensor-load-store) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2xf32>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2xf32>>
+  %7 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %8 = tensor.empty() : tensor<2xf32>
+  %9 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %8) -> (tensor<2xf32>) {
+    %10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %11 = math.absf %in : f32
+      linalg.yield %11 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %10 into %arg1[%c0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %9, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After CombineLayoutTransformationPass (iree-codegen-combine-layout-transformation) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[%c0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After LLVMCPUTileAndFuseProducerConsumerPass (iree-llvmcpu-tile-and-fuse-producer-consumer) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After LLVMCPUTileAndFuseProducerConsumerPass (iree-llvmcpu-tile-and-fuse-producer-consumer) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After ForallToForPass (iree-codegen-forall-to-for) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After LLVMCPUPeelPass (iree-llvmcpu-peel) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After LLVMCPUTileToVectorSizePass (iree-llvmcpu-tile-to-vector-size) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %6 = tensor.empty() : tensor<2xf32>
+  %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) {
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs =  {lowering_config = #iree_cpu.lowering_config<distribution = [2], vector_common_parallel = [4]>} {
+    ^bb0(%in: f32, %out: f32):
+      %9 = math.absf %in : f32
+      linalg.yield %9 : f32
+    } -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %10 = math.absf %9 : vector<2xf32>
+    %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %10 = math.absf %9 : vector<2xf32>
+    %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %10 = math.absf %9 : vector<2xf32>
+    %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %10 = math.absf %9 : vector<2xf32>
+    %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = tensor.empty() : tensor<2xf32>
+  %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %10 = math.absf %9 : vector<2xf32>
+    %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = iree_codegen.load_from_buffer %5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %8 = tensor.empty() : tensor<2xf32>
+  %9 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %10 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %11 = math.absf %10 : vector<2xf32>
+    %12 = vector.transfer_write %11, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %12 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %9, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After EmptyTensorToAllocTensorPass (empty-tensor-to-alloc-tensor) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %7 = iree_codegen.load_from_buffer %5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> -> tensor<2xf32>
+  %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) {
+    %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32>
+    %10 = math.absf %9 : vector<2xf32>
+    %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  return
+}
+
+// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %4[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %5[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+    %subview = memref.subview %5[0] [2] [1] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+    linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  }
+  return
+}
+
+// -----// IR Dump After IREEInjectAssumeAlignmentPass (iree-codegen-inject-assume-alignment) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+    %subview = memref.subview %assume_align_0[0] [2] [1] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+    linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  }
+  return
+}
+
+// -----// IR Dump After ResolveShapedTypeResultDimsPass (resolve-shaped-type-result-dims) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+    %subview = memref.subview %assume_align_0[0] [2] [1] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+    linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    }
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  }
+  return
+}
+
+// -----// IR Dump After IREECodegenCanonicalizerPass (iree-codegen-canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After IREECodegenCanonicalizerPass (iree-codegen-canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After DropVectorUnitDimsPass (iree-codegen-drop-vector-unit-dims) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %0 = ub.poison : f32
+  %c0 = arith.constant 0 : index
+  %1 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %2 = arith.index_castui %1 : i32 to index
+  %3 = util.assume.int %2[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %7 = math.absf %6 : vector<2xf32>
+    vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After VectorTransferLoweringPass (iree-codegen-vector-transfer-lowering) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After VerifyWorkgroupDistributionPass (iree-codegen-verify-workgroup-distribution) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+  %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+  scf.forall (%arg0) = (0) to (2) step (2) {
+    %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
+  return
+}
+
+// -----// IR Dump After ReconcileTranslationInfoPass (iree-codegen-reconcile-translation-info) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+    hal.return %x, %y, %z : index, index, index
+  } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+      %c0 = arith.constant 0 : index
+      %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+      %1 = arith.index_castui %0 : i32 to index
+      %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+      %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %5 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%workgroup_id_x]
+      %6 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      %7 = math.absf %6 : vector<2xf32>
+      vector.store %7, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      iree_codegen.workgroup_count_hint(1)
+      return
+    }
+  }
+}
+
+// -----// IR Dump After ResolveWorkgroupCountHintsPass (iree-codegen-resolve-workgroup-count-hints) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %c1 = arith.constant 1 : index
+    %c1_0 = arith.constant 1 : index
+    %c1_1 = arith.constant 1 : index
+    hal.return %c1, %c1_0, %c1_1 : index, index, index
+  } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+      %c0 = arith.constant 0 : index
+      %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+      %1 = arith.index_castui %0 : i32 to index
+      %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+      %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %5 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%workgroup_id_x]
+      %6 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      %7 = math.absf %6 : vector<2xf32>
+      vector.store %7, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %c1 = arith.constant 1 : index
+    %c1_0 = arith.constant 1 : index
+    %c1_1 = arith.constant 1 : index
+    hal.return %c1, %c1_0, %c1_1 : index, index, index
+  } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+      %c0 = arith.constant 0 : index
+      %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+      %1 = arith.index_castui %0 : i32 to index
+      %2 = util.assume.int %1[<umin = 0, umax = 0>, <umin = 64, umax = 64, udiv = 64>] : index
+      %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %c2 = arith.constant 2 : index
+      %5 = arith.muli %workgroup_id_x, %c2 overflow<nsw> : index
+      %6 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      %7 = math.absf %6 : vector<2xf32>
+      vector.store %7, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After DropCompilerHintsPass (iree-util-drop-compiler-hints) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %c1 = arith.constant 1 : index
+    %c1_0 = arith.constant 1 : index
+    %c1_1 = arith.constant 1 : index
+    hal.return %c1, %c1_0, %c1_1 : index, index, index
+  } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module {
+    func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+      %c0 = arith.constant 0 : index
+      %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+      %1 = arith.index_castui %0 : i32 to index
+      %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %assume_align = memref.assume_alignment %2, 64 : memref<2xf32, #hal.descriptor_type<storage_buffer>>
+      %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %c2 = arith.constant 2 : index
+      %4 = arith.muli %workgroup_id_x, %c2 overflow<nsw> : index
+      %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      %6 = math.absf %5 : vector<2xf32>
+      vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
+      return
+    }
+  }
+}
+
+// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %c2 = arith.constant 2 : index
+  %4 = arith.muli %workgroup_id_x, %c2 overflow<nsw> : index
+  %5 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %6 = math.absf %5 : vector<2xf32>
+  vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
+module {
+  func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+    %1 = arith.index_castui %0 : i32 to index
+    %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+    %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+    %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+    %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+    %workgroup_id_x = hal.interface.workgroup.id[0] : index
+    %workgroup_count_x = hal.interface.workgroup.count[0] : index
+    %c2 = arith.constant 2 : index
+    %4 = arith.muli %workgroup_id_x, %c2 overflow<nsw> : index
+    %5 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+    %6 = math.absf %5 : vector<2xf32>
+    vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+    return
+  }
+}
+
+// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After IREEBufferizeConstantsPass (iree-codegen-iree-bufferize-constants) //----- //
+module {
+  func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+    %1 = arith.index_castui %0 : i32 to index
+    %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+    %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+    %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+    %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+    %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+    %5 = math.absf %4 : vector<2xf32>
+    vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+    return
+  }
+}
+
+// -----// IR Dump After FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After ConvertComplexToStandardPass (convert-complex-to-standard) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After MathTransformPass (iree-codegen-math-transform) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After VectorTransferLoweringPass (iree-codegen-vector-transfer-lowering) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After FoldMemRefAliasOpsPass (fold-memref-alias-ops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversionPass (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After FoldMemRefAliasOpsPass (fold-memref-alias-ops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After ArithExpandOpsPass (arith-expand) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+func.func @multiple_results_dispatch_0_elementwise_2_f32() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = arith.index_castui %0 : i32 to index
+  %2 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32>
+  %assume_align = memref.assume_alignment %2, 64 : memref<2xf32>
+  %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>>
+  %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>>
+  %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32>
+  %5 = math.absf %4 : vector<2xf32>
+  vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32>
+  return
+}
+
+// -----// IR Dump After ConvertToLLVMPass (iree-convert-to-llvm) //----- //
+module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+  llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(8 : i64) : i64
+    %2 = llvm.mlir.constant(32 : i64) : i64
+    %3 = llvm.mlir.constant(64 : index) : i64
+    %4 = llvm.mlir.constant(true) : i1
+    %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %7 = llvm.load %6 : !llvm.ptr -> i32
+    %8 = llvm.zext %7 : i32 to i64
+    %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1
+    %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+    %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr
+    %16 = llvm.mul %8, %1 : i64
+    %17 = llvm.udiv %16, %2 : i64
+    %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1
+    %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+    %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32>
+    %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+    llvm.return %0 : i32
+  }
+}
+
+// -----// IR Dump After ReconcileUnrealizedCastsPass (reconcile-unrealized-casts) //----- //
+module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+  llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(8 : i64) : i64
+    %2 = llvm.mlir.constant(32 : i64) : i64
+    %3 = llvm.mlir.constant(64 : index) : i64
+    %4 = llvm.mlir.constant(true) : i1
+    %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %7 = llvm.load %6 : !llvm.ptr -> i32
+    %8 = llvm.zext %7 : i32 to i64
+    %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1
+    %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+    %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr
+    %16 = llvm.mul %8, %1 : i64
+    %17 = llvm.udiv %16, %2 : i64
+    %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1
+    %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+    %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32>
+    %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+    llvm.return %0 : i32
+  }
+}
+
+// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibilityPass (iree-llvmcpu-synchronize-symbol-visibility) //----- //
+module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+  llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(8 : i64) : i64
+    %2 = llvm.mlir.constant(32 : i64) : i64
+    %3 = llvm.mlir.constant(64 : index) : i64
+    %4 = llvm.mlir.constant(true) : i1
+    %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %7 = llvm.load %6 : !llvm.ptr -> i32
+    %8 = llvm.zext %7 : i32 to i64
+    %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1
+    %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+    %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr
+    %16 = llvm.mul %8, %1 : i64
+    %17 = llvm.udiv %16, %2 : i64
+    %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1
+    %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+    %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32>
+    %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+    llvm.return %0 : i32
+  }
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+  llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(8 : i64) : i64
+    %2 = llvm.mlir.constant(32 : i64) : i64
+    %3 = llvm.mlir.constant(64 : index) : i64
+    %4 = llvm.mlir.constant(true) : i1
+    %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %7 = llvm.load %6 : !llvm.ptr -> i32
+    %8 = llvm.zext %7 : i32 to i64
+    %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1
+    %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+    %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr
+    %16 = llvm.mul %8, %1 : i64
+    %17 = llvm.udiv %16, %2 : i64
+    %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1
+    %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+    %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32>
+    %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+    llvm.return %0 : i32
+  }
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+  llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(8 : i64) : i64
+    %2 = llvm.mlir.constant(32 : i64) : i64
+    %3 = llvm.mlir.constant(64 : index) : i64
+    %4 = llvm.mlir.constant(true) : i1
+    %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %7 = llvm.load %6 : !llvm.ptr -> i32
+    %8 = llvm.zext %7 : i32 to i64
+    %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+    llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+    %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+    %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+    %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+    %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+    %15 = llvm.mul %8, %1 : i64
+    %16 = llvm.udiv %15, %2 : i64
+    %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+    %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+    %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+    llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+    llvm.return %0 : i32
+  }
+}
+
+// -----// IR Dump After AddFastMathFlagsPass (iree-codegen-add-fast-math-flags) //----- //
+llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  %1 = llvm.mlir.constant(8 : i64) : i64
+  %2 = llvm.mlir.constant(32 : i64) : i64
+  %3 = llvm.mlir.constant(64 : index) : i64
+  %4 = llvm.mlir.constant(true) : i1
+  %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+  %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+  %7 = llvm.load %6 : !llvm.ptr -> i32
+  %8 = llvm.zext %7 : i32 to i64
+  %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+  %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+  llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+  %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+  %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+  %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+  %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+  %15 = llvm.mul %8, %1 : i64
+  %16 = llvm.udiv %15, %2 : i64
+  %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+  %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+  %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+  llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+  llvm.return %0 : i32
+}
+
+// -----// IR Dump After TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+    %c1 = arith.constant 1 : index
+    %c1_0 = arith.constant 1 : index
+    %c1_1 = arith.constant 1 : index
+    hal.return %c1, %c1_0, %c1_1 : index, index, index
+  } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+    llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+      %0 = llvm.mlir.constant(0 : i32) : i32
+      %1 = llvm.mlir.constant(8 : i64) : i64
+      %2 = llvm.mlir.constant(32 : i64) : i64
+      %3 = llvm.mlir.constant(64 : index) : i64
+      %4 = llvm.mlir.constant(true) : i1
+      %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %7 = llvm.load %6 : !llvm.ptr -> i32
+      %8 = llvm.zext %7 : i32 to i64
+      %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+      llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+      %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+      %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+      %15 = llvm.mul %8, %1 : i64
+      %16 = llvm.udiv %15, %2 : i64
+      %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+      %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+      %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+      llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+      llvm.return %0 : i32
+    }
+  }
+}
+
+// -----// IR Dump After TranslateAllExecutablesPass (iree-hal-translate-all-executables) //----- //
+hal.executable private @multiple_results_dispatch_0 {
+  hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+    hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
+      %c1 = arith.constant 1 : index
+      %c1_0 = arith.constant 1 : index
+      %c1_1 = arith.constant 1 : index
+      hal.return %c1, %c1_0, %c1_1 : index, index, index
+    } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+    builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+      llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+        %0 = llvm.mlir.constant(0 : i32) : i32
+        %1 = llvm.mlir.constant(8 : i64) : i64
+        %2 = llvm.mlir.constant(32 : i64) : i64
+        %3 = llvm.mlir.constant(64 : index) : i64
+        %4 = llvm.mlir.constant(true) : i1
+        %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+        %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+        %7 = llvm.load %6 : !llvm.ptr -> i32
+        %8 = llvm.zext %7 : i32 to i64
+        %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+        %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+        llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+        %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+        %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+        %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+        %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+        %15 = llvm.mul %8, %1 : i64
+        %16 = llvm.udiv %15, %2 : i64
+        %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+        %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+        %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+        llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+        llvm.return %0 : i32
+      }
+    }
+  }
+}
+
+// -----// IR Dump After ConvertToHALPass (iree-hal-conversion) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    %__device_0_1 = util.global.load immutable @__device_0 : !hal.device
+    %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %__device_0_3 = util.global.load immutable @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %__device_0_4 = util.global.load immutable @__device_0 : !hal.device
+    %c-1_i64_5 = arith.constant -1 : i64
+    %c0_6 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2_7 = arith.constant 2 : index
+    %1 = hal.device.memoize<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) -> !hal.command_buffer {
+      %c3 = arith.constant 3 : index
+      %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64_5) bindings(%c3) : !hal.command_buffer
+      %2 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
+      %exe = hal.executable.lookup device(%2 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+      %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+      %c1_20 = arith.constant 1 : index
+      %c1_21 = arith.constant 1 : index
+      %c1_22 = arith.constant 1 : index
+      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1_20, %c1_21, %c1_22]) constants([%c0_i32]) bindings([
+        (%c0_6 : index)[%c0, %c8], 
+        (%c2_7 : index)[%c0, %c128]
+      ]) flags("None")
+      %3 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
+      %exe_23 = hal.executable.lookup device(%3 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+      %ordinal_24 = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+      %c1_25 = arith.constant 1 : index
+      %c1_26 = arith.constant 1 : index
+      %c1_27 = arith.constant 1 : index
+      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe_23 : !hal.executable)[%ordinal_24] workgroups([%c1_25, %c1_26, %c1_27]) constants([%c64_i32]) bindings([
+        (%c1 : index)[%c0, %c8], 
+        (%c2_7 : index)[%c0, %c128]
+      ]) flags("None")
+      hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+      hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+      hal.return %cmd : !hal.command_buffer
+    }
+    %fence_8 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_8) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0_6, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0_6, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0_6, %c128]
+    ]) flags("None")
+    %c-1_i32 = arith.constant -1 : i32
+    %status = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %buffer_9 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c0, %c8] : !hal.buffer
+    %buffer_10 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c64, %c8] : !hal.buffer
+    %dense_row_major_11 = hal.encoding_type<dense_row_major> : i32
+    %element_type_f32_12 = hal.element_type<f32> : i32
+    %c2_13 = arith.constant 2 : index
+    %c0_14 = arith.constant 0 : index
+    %view = hal.buffer_view.create buffer(%buffer_9 : !hal.buffer)[%c0_14, %c8] shape([%c2_13]) type(%element_type_f32_12) encoding(%dense_row_major_11) : !hal.buffer_view
+    %dense_row_major_15 = hal.encoding_type<dense_row_major> : i32
+    %element_type_f32_16 = hal.element_type<f32> : i32
+    %c2_17 = arith.constant 2 : index
+    %c0_18 = arith.constant 0 : index
+    %view_19 = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_18, %c8] shape([%c2_17]) type(%element_type_f32_16) encoding(%dense_row_major_15) : !hal.buffer_view
+    util.return %view, %view_19 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After OutlineMemoizeRegionsPass (iree-hal-outline-memoize-regions) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c0_0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    cf.br ^bb1
+  ^bb1:  // pred: ^bb0
+    %c3 = arith.constant 3 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
+    %exe = hal.executable.lookup device(%0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    %c1_1 = arith.constant 1 : index
+    %c1_2 = arith.constant 1 : index
+    %c1_3 = arith.constant 1 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1_1, %c1_2, %c1_3]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0_0, %c8], 
+      (%c2 : index)[%c0_0, %c128]
+    ]) flags("None")
+    %1 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
+    %exe_4 = hal.executable.lookup device(%1 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal_5 = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    %c1_6 = arith.constant 1 : index
+    %c1_7 = arith.constant 1 : index
+    %c1_8 = arith.constant 1 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe_4 : !hal.executable)[%ordinal_5] workgroups([%c1_6, %c1_7, %c1_8]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0_0, %c8], 
+      (%c2 : index)[%c0_0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %0 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %1 = scf.if %0 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      %2 = util.null : !hal.command_buffer
+      scf.yield %2 : !hal.command_buffer
+    }
+    util.return %1 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    %__device_0_1 = util.global.load immutable @__device_0 : !hal.device
+    %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %__device_0_3 = util.global.load immutable @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %__device_0_4 = util.global.load immutable @__device_0 : !hal.device
+    %c-1_i64_5 = arith.constant -1 : i64
+    %c0_6 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2_7 = arith.constant 2 : index
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0_4, %c-1_i64_5) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_8 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_8) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0_6, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0_6, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0_6, %c128]
+    ]) flags("None")
+    %c-1_i32 = arith.constant -1 : i32
+    %status = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %buffer_9 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c0, %c8] : !hal.buffer
+    %buffer_10 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c64, %c8] : !hal.buffer
+    %dense_row_major_11 = hal.encoding_type<dense_row_major> : i32
+    %element_type_f32_12 = hal.element_type<f32> : i32
+    %c2_13 = arith.constant 2 : index
+    %c0_14 = arith.constant 0 : index
+    %view = hal.buffer_view.create buffer(%buffer_9 : !hal.buffer)[%c0_14, %c8] shape([%c2_13]) type(%element_type_f32_12) encoding(%dense_row_major_11) : !hal.buffer_view
+    %dense_row_major_15 = hal.encoding_type<dense_row_major> : i32
+    %element_type_f32_16 = hal.element_type<f32> : i32
+    %c2_17 = arith.constant 2 : index
+    %c0_18 = arith.constant 0 : index
+    %view_19 = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_18, %c8] shape([%c2_17]) type(%element_type_f32_16) encoding(%dense_row_major_15) : !hal.buffer_view
+    util.return %view, %view_19 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+  %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  %exe_0 = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+  %ordinal_1 = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe_0 : !hal.executable)[%ordinal_1] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  %__device_0_1 = util.global.load immutable @__device_0 : !hal.device
+  %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+  %__device_0_3 = util.global.load immutable @__device_0 : !hal.device
+  %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %__device_0_4 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0_4, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_5 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0_4 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_5) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_5]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %dense_row_major_6 = hal.encoding_type<dense_row_major> : i32
+  %element_type_f32_7 = hal.element_type<f32> : i32
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32_7) encoding(%dense_row_major_6) : !hal.buffer_view
+  %dense_row_major_8 = hal.encoding_type<dense_row_major> : i32
+  %element_type_f32_9 = hal.element_type<f32> : i32
+  %view_10 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32_9) encoding(%dense_row_major_8) : !hal.buffer_view
+  util.return %view, %view_10 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %0 = util.null : !hal.command_buffer
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+  %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+  %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+  %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After LLVMCPULinkExecutablesPass (iree-llvmcpu-link-executables) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+hal.executable private @multiple_results_dispatch_0 {
+  hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+    hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+    builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+      llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+        %0 = llvm.mlir.constant(0 : i32) : i32
+        %1 = llvm.mlir.constant(8 : i64) : i64
+        %2 = llvm.mlir.constant(32 : i64) : i64
+        %3 = llvm.mlir.constant(64 : index) : i64
+        %4 = llvm.mlir.constant(true) : i1
+        %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+        %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+        %7 = llvm.load %6 : !llvm.ptr -> i32
+        %8 = llvm.zext %7 : i32 to i64
+        %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+        %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+        llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+        %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+        %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+        %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+        %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+        %15 = llvm.mul %8, %1 : i64
+        %16 = llvm.udiv %15, %2 : i64
+        %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+        %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+        %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+        llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+        llvm.return %0 : i32
+      }
+    }
+  }
+}
+
+// -----// IR Dump After LLVMCPUAssignConstantOrdinalsPass (iree-llvmcpu-assign-constant-ordinals) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+    llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+      %0 = llvm.mlir.constant(0 : i32) : i32
+      %1 = llvm.mlir.constant(8 : i64) : i64
+      %2 = llvm.mlir.constant(32 : i64) : i64
+      %3 = llvm.mlir.constant(64 : index) : i64
+      %4 = llvm.mlir.constant(true) : i1
+      %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %7 = llvm.load %6 : !llvm.ptr -> i32
+      %8 = llvm.zext %7 : i32 to i64
+      %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+      llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+      %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+      %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+      %15 = llvm.mul %8, %1 : i64
+      %16 = llvm.udiv %15, %2 : i64
+      %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+      %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+      %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+      llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+      llvm.return %0 : i32
+    }
+  }
+}
+
+// -----// IR Dump After LLVMCPUAssignImportOrdinalsPass (iree-llvmcpu-assign-import-ordinals) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+    llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+      %0 = llvm.mlir.constant(0 : i32) : i32
+      %1 = llvm.mlir.constant(8 : i64) : i64
+      %2 = llvm.mlir.constant(32 : i64) : i64
+      %3 = llvm.mlir.constant(64 : index) : i64
+      %4 = llvm.mlir.constant(true) : i1
+      %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %7 = llvm.load %6 : !llvm.ptr -> i32
+      %8 = llvm.zext %7 : i32 to i64
+      %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+      llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+      %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+      %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+      %15 = llvm.mul %8, %1 : i64
+      %16 = llvm.udiv %15, %2 : i64
+      %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+      %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+      %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+      llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+      llvm.return %0 : i32
+    }
+  }
+}
+
+// -----// IR Dump After LinkTargetExecutablesPass (iree-hal-link-target-executables) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After LinkAllExecutablesPass (iree-hal-link-all-executables) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- //
+hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) {
+  hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+    llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+      %0 = llvm.mlir.constant(0 : i32) : i32
+      %1 = llvm.mlir.constant(8 : i64) : i64
+      %2 = llvm.mlir.constant(32 : i64) : i64
+      %3 = llvm.mlir.constant(64 : index) : i64
+      %4 = llvm.mlir.constant(true) : i1
+      %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %7 = llvm.load %6 : !llvm.ptr -> i32
+      %8 = llvm.zext %7 : i32 to i64
+      %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+      llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+      %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+      %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+      %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+      %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+      %15 = llvm.mul %8, %1 : i64
+      %16 = llvm.udiv %15, %2 : i64
+      %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+      %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+      %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+      llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+      llvm.return %0 : i32
+    }
+  }
+}
+
+// -----// IR Dump After ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable
+    %c0_0 = arith.constant 0 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %0 = arith.select %value, %c0, %c-1 : index
+    %1 = scf.index_switch %0 -> !hal.executable 
+    case 0 {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    }
+    default {
+      %c14_i32 = arith.constant 14 : i32
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      %2 = util.null : !hal.executable
+      scf.yield %2 : !hal.executable
+    }
+    util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %c0_0 = arith.constant 0 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ResolveTopologyQueriesPass (iree-hal-resolve-topology-queries) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %0 = arith.select %value, %c0, %c-1 : index
+    %1 = scf.index_switch %0 -> !hal.executable 
+    case 0 {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    }
+    default {
+      %c14_i32 = arith.constant 14 : i32
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      %2 = util.null : !hal.executable
+      scf.yield %2 : !hal.executable
+    }
+    util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %c0_0 = arith.constant 0 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MemoizeDeviceSelectionPass (iree-hal-memoize-device-selection) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %0 = arith.select %value, %c0, %c-1 : index
+    %1 = scf.index_switch %0 -> !hal.executable 
+    case 0 {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    }
+    default {
+      %c14_i32 = arith.constant 14 : i32
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      %2 = util.null : !hal.executable
+      scf.yield %2 : !hal.executable
+    }
+    util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %c0_0 = arith.constant 0 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %1 = scf.index_switch %0 -> !hal.executable 
+    case 0 {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    }
+    default {
+      %c14_i32 = arith.constant 14 : i32
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      %2 = util.null : !hal.executable
+      scf.yield %2 : !hal.executable
+    }
+    util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %c0_0 = arith.constant 0 : index
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = scf.index_switch %1 -> !hal.executable 
+  case 0 {
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    scf.yield %executable : !hal.executable
+  }
+  default {
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    scf.yield %0 : !hal.executable
+  }
+  util.global.store %2, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = scf.index_switch %1 -> !hal.executable 
+  case 0 {
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    scf.yield %executable : !hal.executable
+  }
+  default {
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    scf.yield %0 : !hal.executable
+  }
+  util.global.store %2, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = scf.index_switch %1 -> !hal.executable 
+  case 0 {
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    scf.yield %executable : !hal.executable
+  }
+  default {
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    scf.yield %0 : !hal.executable
+  }
+  util.global.store %2, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  %3 = scf.if %2 -> (!hal.executable) {
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    scf.yield %executable : !hal.executable
+  } else {
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    scf.yield %0 : !hal.executable
+  }
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.null : !hal.command_buffer
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    %3 = scf.if %2 -> (!hal.executable) {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    } else {
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      scf.yield %0 : !hal.executable
+    }
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device
+module {
+  util.global private @__device_0 = #device_target_local
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    %3 = scf.if %2 -> (!hal.executable) {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    } else {
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      scf.yield %0 : !hal.executable
+    }
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  %3 = scf.if %2 -> (!hal.executable) {
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    scf.yield %executable : !hal.executable
+  } else {
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    scf.yield %0 : !hal.executable
+  }
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = scf.if %1 -> (!hal.command_buffer) {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  } else {
+    scf.yield %0 : !hal.command_buffer
+  }
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After InitializeDevicesPass (iree-hal-initialize-devices) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) {
+      %4 = util.cmp.eq %arg2, %0 : !hal.device
+      %5 = arith.cmpi slt, %arg0, %device_count : index
+      %6 = arith.andi %4, %5 : i1
+      scf.condition(%6) %arg0, %arg1, %arg2 : index, index, !hal.device
+    } do {
+    ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device):
+      %device_n = hal.devices.get %arg0 : !hal.device
+      %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+      %4 = scf.if %value -> (i1) {
+        %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+        scf.yield %value_1 : i1
+      } else {
+        %false = arith.constant false
+        scf.yield %false : i1
+      }
+      %5 = arith.cmpi eq, %arg1, %c0 : index
+      %6 = arith.select %4, %c1, %c0 : index
+      %7 = arith.addi %arg1, %6 : index
+      %8 = arith.andi %4, %5 : i1
+      %9 = arith.select %8, %device_n, %0 : !hal.device
+      %10 = arith.addi %arg0, %c1 : index
+      scf.yield %10, %7, %9 : index, index, !hal.device
+    }
+    %2 = util.null : !hal.device
+    %3 = util.cmp.eq %1#2, %2 : !hal.device
+    scf.if %3 {
+      %c18_i32 = arith.constant 18 : i32
+      util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    }
+    util.global.store %1#2, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    %3 = scf.if %2 -> (!hal.executable) {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    } else {
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      scf.yield %0 : !hal.executable
+    }
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) {
+      %3 = util.cmp.eq %arg2, %0 : !hal.device
+      %4 = arith.cmpi slt, %arg0, %device_count : index
+      %5 = arith.andi %3, %4 : i1
+      scf.condition(%5) %arg0, %arg1, %arg2 : index, index, !hal.device
+    } do {
+    ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device):
+      %device_n = hal.devices.get %arg0 : !hal.device
+      %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+      %3 = scf.if %value -> (i1) {
+        %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+        scf.yield %value_1 : i1
+      } else {
+        scf.yield %false : i1
+      }
+      %4 = arith.cmpi eq, %arg1, %c0 : index
+      %5 = arith.select %3, %c1, %c0 : index
+      %6 = arith.addi %arg1, %5 : index
+      %7 = arith.andi %3, %4 : i1
+      %8 = arith.select %7, %device_n, %0 : !hal.device
+      %9 = arith.addi %arg0, %c1 : index
+      scf.yield %9, %6, %8 : index, index, !hal.device
+    }
+    %2 = util.cmp.eq %1#2, %0 : !hal.device
+    scf.if %2 {
+      util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    }
+    util.global.store %1#2, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    %3 = scf.if %2 -> (!hal.executable) {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    } else {
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      scf.yield %0 : !hal.executable
+    }
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- //
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) {
+      %3 = util.cmp.eq %arg2, %0 : !hal.device
+      %4 = arith.cmpi slt, %arg0, %device_count : index
+      %5 = arith.andi %3, %4 : i1
+      scf.condition(%5) %arg0, %arg1, %arg2 : index, index, !hal.device
+    } do {
+    ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device):
+      %device_n = hal.devices.get %arg0 : !hal.device
+      %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+      %3 = scf.if %value -> (i1) {
+        %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+        scf.yield %value_1 : i1
+      } else {
+        scf.yield %false : i1
+      }
+      %4 = arith.cmpi eq, %arg1, %c0 : index
+      %5 = arith.select %3, %c1, %c0 : index
+      %6 = arith.addi %arg1, %5 : index
+      %7 = arith.andi %3, %4 : i1
+      %8 = arith.select %7, %device_n, %0 : !hal.device
+      %9 = arith.addi %arg0, %c1 : index
+      scf.yield %9, %6, %8 : index, index, !hal.device
+    }
+    %2 = util.cmp.eq %1#2, %0 : !hal.device
+    scf.if %2 {
+      util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    }
+    util.global.store %1#2, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    %3 = scf.if %2 -> (!hal.executable) {
+      %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+      scf.yield %executable : !hal.executable
+    } else {
+      util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      scf.yield %0 : !hal.executable
+    }
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) {
+      hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]}
+      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} {
+        llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 {
+          %0 = llvm.mlir.constant(0 : i32) : i32
+          %1 = llvm.mlir.constant(8 : i64) : i64
+          %2 = llvm.mlir.constant(32 : i64) : i64
+          %3 = llvm.mlir.constant(64 : index) : i64
+          %4 = llvm.mlir.constant(true) : i1
+          %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %7 = llvm.load %6 : !llvm.ptr -> i32
+          %8 = llvm.zext %7 : i32 to i64
+          %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr
+          llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1
+          %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
+          %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> 
+          %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
+          %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr
+          %15 = llvm.mul %8, %1 : i64
+          %16 = llvm.udiv %15, %2 : i64
+          %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+          llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1
+          %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32>
+          %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32>
+          llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr
+          llvm.return %0 : i32
+        }
+      }
+    }
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = scf.if %1 -> (!hal.command_buffer) {
+      %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+      scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    } else {
+      scf.yield %0 : !hal.command_buffer
+    }
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb6
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2(%1, %2, %3 : index, index, !hal.device), ^bb7
+^bb2(%7: index, %8: index, %9: !hal.device):  // pred: ^bb1
+  %device_n = hal.devices.get %7 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb5(%value_1 : i1)
+^bb4:  // pred: ^bb2
+  cf.br ^bb5(%false : i1)
+^bb5(%10: i1):  // 2 preds: ^bb3, ^bb4
+  cf.br ^bb6
+^bb6:  // pred: ^bb5
+  %11 = arith.cmpi eq, %8, %c0 : index
+  %12 = arith.select %10, %c1, %c0 : index
+  %13 = arith.addi %8, %12 : index
+  %14 = arith.andi %10, %11 : i1
+  %15 = arith.select %14, %device_n, %0 : !hal.device
+  %16 = arith.addi %7, %c1 : index
+  cf.br ^bb1(%16, %13, %15 : index, index, !hal.device)
+^bb7:  // pred: ^bb1
+  %17 = util.cmp.eq %3, %0 : !hal.device
+  cf.cond_br %17, ^bb8, ^bb9
+^bb8:  // pred: ^bb7
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb9
+^bb9:  // 2 preds: ^bb7, ^bb8
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  cf.br ^bb4
+^bb4:  // pred: ^bb3
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  cf.cond_br %1, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  cf.br ^bb3(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer)
+^bb2:  // pred: ^bb0
+  cf.br ^bb3(%0 : !hal.command_buffer)
+^bb3(%2: !hal.command_buffer):  // 2 preds: ^bb1, ^bb2
+  cf.br ^bb4
+^bb4:  // pred: ^bb3
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- //
+hal.executable private @multiple_results_dispatch_0 {
+  hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000
06D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000
105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F0000000800000
0030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+}
+
+// -----// IR Dump After SerializeAllExecutablesPass (iree-hal-serialize-all-executables) //----- //
+hal.executable private @multiple_results_dispatch_0 {
+  hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000
06D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000
105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F0000000800000
0030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+}
+
+// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb6
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2(%1, %2, %3 : index, index, !hal.device), ^bb7
+  ^bb2(%7: index, %8: index, %9: !hal.device):  // pred: ^bb1
+    %device_n = hal.devices.get %7 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb5(%value_1 : i1)
+  ^bb4:  // pred: ^bb2
+    cf.br ^bb5(%false : i1)
+  ^bb5(%10: i1):  // 2 preds: ^bb3, ^bb4
+    cf.br ^bb6
+  ^bb6:  // pred: ^bb5
+    %11 = arith.cmpi eq, %8, %c0 : index
+    %12 = arith.select %10, %c1, %c0 : index
+    %13 = arith.addi %8, %12 : index
+    %14 = arith.andi %10, %11 : i1
+    %15 = arith.select %14, %device_n, %0 : !hal.device
+    %16 = arith.addi %7, %c1 : index
+    cf.br ^bb1(%16, %13, %15 : index, index, !hal.device)
+  ^bb7:  // pred: ^bb1
+    %17 = util.cmp.eq %3, %0 : !hal.device
+    cf.cond_br %17, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb9
+  ^bb9:  // 2 preds: ^bb7, ^bb8
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    cf.br ^bb4
+  ^bb4:  // pred: ^bb3
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    cf.cond_br %1, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    cf.br ^bb3(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer)
+  ^bb2:  // pred: ^bb0
+    cf.br ^bb3(%0 : !hal.command_buffer)
+  ^bb3(%2: !hal.command_buffer):  // 2 preds: ^bb1, ^bb2
+    cf.br ^bb4
+  ^bb4:  // pred: ^bb3
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb6
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2(%1, %2, %3 : index, index, !hal.device), ^bb7
+  ^bb2(%7: index, %8: index, %9: !hal.device):  // pred: ^bb1
+    %device_n = hal.devices.get %7 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb5(%value_1 : i1)
+  ^bb4:  // pred: ^bb2
+    cf.br ^bb5(%false : i1)
+  ^bb5(%10: i1):  // 2 preds: ^bb3, ^bb4
+    cf.br ^bb6
+  ^bb6:  // pred: ^bb5
+    %11 = arith.cmpi eq, %8, %c0 : index
+    %12 = arith.select %10, %c1, %c0 : index
+    %13 = arith.addi %8, %12 : index
+    %14 = arith.andi %10, %11 : i1
+    %15 = arith.select %14, %device_n, %0 : !hal.device
+    %16 = arith.addi %7, %c1 : index
+    cf.br ^bb1(%16, %13, %15 : index, index, !hal.device)
+  ^bb7:  // pred: ^bb1
+    %17 = util.cmp.eq %3, %0 : !hal.device
+    cf.cond_br %17, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb9
+  ^bb9:  // 2 preds: ^bb7, ^bb8
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    cf.br ^bb4
+  ^bb4:  // pred: ^bb3
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    cf.cond_br %1, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    cf.br ^bb3(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer)
+  ^bb2:  // pred: ^bb0
+    cf.br ^bb3(%0 : !hal.command_buffer)
+  ^bb3(%2: !hal.command_buffer):  // 2 preds: ^bb1, ^bb2
+    cf.br ^bb4
+  ^bb4:  // pred: ^bb3
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  cf.cond_br %1, ^bb1, ^bb2(%0 : !hal.command_buffer)
+^bb1:  // pred: ^bb0
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  cf.br ^bb2(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer)
+^bb2(%2: !hal.command_buffer):  // 2 preds: ^bb0, ^bb1
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  cf.cond_br %1, ^bb1, ^bb2(%0 : !hal.command_buffer)
+^bb1:  // pred: ^bb0
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  cf.br ^bb2(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer)
+^bb2(%2: !hal.command_buffer):  // 2 preds: ^bb0, ^bb1
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2(%1, %2 : index, index), ^bb5
+^bb2(%7: index, %8: index):  // pred: ^bb1
+  %device_n = hal.devices.get %7 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%9: i1):  // 2 preds: ^bb2, ^bb3
+  %10 = arith.cmpi eq, %8, %c0 : index
+  %11 = arith.select %9, %c1, %c0 : index
+  %12 = arith.addi %8, %11 : index
+  %13 = arith.andi %9, %10 : i1
+  %14 = arith.select %13, %device_n, %0 : !hal.device
+  %15 = arith.addi %7, %c1 : index
+  cf.br ^bb1(%15, %12, %14 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  %16 = util.cmp.eq %3, %0 : !hal.device
+  cf.cond_br %16, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2(%1, %2 : index, index), ^bb5
+^bb2(%7: index, %8: index):  // pred: ^bb1
+  %device_n = hal.devices.get %7 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%9: i1):  // 2 preds: ^bb2, ^bb3
+  %10 = arith.cmpi eq, %8, %c0 : index
+  %11 = arith.select %9, %c1, %c0 : index
+  %12 = arith.addi %8, %11 : index
+  %13 = arith.andi %9, %10 : i1
+  %14 = arith.select %13, %device_n, %0 : !hal.device
+  %15 = arith.addi %7, %c1 : index
+  cf.br ^bb1(%15, %12, %14 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2(%1, %2 : index, index), ^bb5
+^bb2(%7: index, %8: index):  // pred: ^bb1
+  %device_n = hal.devices.get %7 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%9: i1):  // 2 preds: ^bb2, ^bb3
+  %10 = arith.cmpi eq, %8, %c0 : index
+  %11 = arith.select %9, %c1, %c0 : index
+  %12 = arith.addi %8, %11 : index
+  %13 = arith.andi %9, %10 : i1
+  %14 = arith.select %13, %device_n, %0 : !hal.device
+  %15 = arith.addi %7, %c1 : index
+  cf.br ^bb1(%15, %12, %14 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %0 = util.null : !hal.command_buffer
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  cf.cond_br %1, ^bb1, ^bb2(%0 : !hal.command_buffer)
+^bb1:  // pred: ^bb0
+  cf.br ^bb2(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer)
+^bb2(%2: !hal.command_buffer):  // 2 preds: ^bb0, ^bb1
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+  %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 0 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c0 = arith.constant 0 : index
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 0 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c0 = arith.constant 0 : index
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer {
+    %0 = util.null : !hal.command_buffer
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %1 = util.cmp.eq %arg0, %__device_0 : !hal.device
+    %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+module attributes {iree.fixedpoint.iteration = 0 : index, iree.fixedpoint.modified} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c0 = arith.constant 0 : index
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %0 = util.null : !hal.command_buffer
+    %__device_0_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %1 = util.cmp.eq %__device_0, %__device_0_0 : !hal.device
+    %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+    util.return %2 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %1 = util.cmp.eq %__device_0, %__device_0_0 : !hal.device
+  %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+  %0 = util.null : !hal.command_buffer
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %1 = util.cmp.eq %__device_0, %__device_0 : !hal.device
+  %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %0 = util.null : !hal.command_buffer
+  %1 = util.cmp.eq %__device_0, %__device_0 : !hal.device
+  %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer
+  util.return %2 : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 1 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 1 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+module attributes {iree.fixedpoint.iteration = 1 : index, iree.fixedpoint.modified} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup() {
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    util.call @__multiple_results_memoize_lookup() : () -> ()
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  util.call @__multiple_results_memoize_lookup() : () -> ()
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_lookup() {
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_lookup() {
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_lookup() {
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  util.call @__multiple_results_memoize_lookup() : () -> ()
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  util.call @__multiple_results_memoize_lookup() : () -> ()
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_lookup() {
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  util.call @__multiple_results_memoize_lookup() : () -> ()
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 2 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c0 = arith.constant 0 : index
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup() {
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    util.call @__multiple_results_memoize_lookup() : () -> ()
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 2 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c0 = arith.constant 0 : index
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func private @__multiple_results_memoize_lookup() {
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    util.call @__multiple_results_memoize_lookup() : () -> ()
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+module attributes {iree.fixedpoint.iteration = 2 : index, iree.fixedpoint.modified} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c0 = arith.constant 0 : index
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c0 = arith.constant 0 : index
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 3 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module attributes {iree.fixedpoint.iteration = 3 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
+module attributes {iree.fixedpoint.iteration = 3 : index} {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %4 = util.cmp.eq %3, %0 : !hal.device
+  %5 = arith.cmpi slt, %1, %device_count : index
+  %6 = arith.andi %4, %5 : i1
+  cf.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %1 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+  %8 = arith.cmpi eq, %2, %c0 : index
+  %9 = arith.select %7, %c1, %c0 : index
+  %10 = arith.addi %2, %9 : index
+  %11 = arith.andi %7, %8 : i1
+  %12 = arith.select %11, %device_n, %0 : !hal.device
+  %13 = arith.addi %1, %c1 : index
+  cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %4, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %3, @__device_0 : !hal.device
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %c-1_i64 = arith.constant -1 : i64
+  %c-1 = arith.constant -1 : index
+  %c0 = arith.constant 0 : index
+  %c14_i32 = arith.constant 14 : i32
+  %0 = util.null : !hal.executable
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %2 = arith.cmpi eq, %1, %c0 : index
+  cf.cond_br %2, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb3(%executable : !hal.executable)
+^bb2:  // pred: ^bb0
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb3(%0 : !hal.executable)
+^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+  util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Inliner (inline) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    util.return
+  }
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.initializer {
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    util.return
+  }
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.initializer {
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %0 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %2 = arith.cmpi eq, %1, %c0 : index
+    cf.cond_br %2, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb3(%executable : !hal.executable)
+  ^bb2:  // pred: ^bb0
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb3(%0 : !hal.executable)
+  ^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
+    util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device)
+  ^bb1(%1: index, %2: index, %3: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %4 = util.cmp.eq %3, %0 : !hal.device
+    %5 = arith.cmpi slt, %1, %device_count : index
+    %6 = arith.andi %4, %5 : i1
+    cf.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %1 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%7: i1):  // 2 preds: ^bb2, ^bb3
+    %8 = arith.cmpi eq, %2, %c0 : index
+    %9 = arith.select %7, %c1, %c0 : index
+    %10 = arith.addi %2, %9 : index
+    %11 = arith.andi %7, %8 : i1
+    %12 = arith.select %11, %device_n, %0 : !hal.device
+    %13 = arith.addi %1, %c1 : index
+    cf.br ^bb1(%13, %10, %12 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %4, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %3, @__device_0 : !hal.device
+    cf.br ^bb8
+  ^bb8:  // pred: ^bb7
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %c-1_i64 = arith.constant -1 : i64
+    %c-1 = arith.constant -1 : index
+    %c0_4 = arith.constant 0 : index
+    %c14_i32 = arith.constant 14 : i32
+    %14 = util.null : !hal.executable
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_5 = util.global.load @__device_0 : !hal.device
+    %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0_4, %c-1 : index
+    %16 = arith.cmpi eq, %15, %c0_4 : index
+    cf.cond_br %16, ^bb9, ^bb10
+  ^bb9:  // pred: ^bb8
+    %executable = hal.executable.create device(%__device_0_5 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb11(%executable : !hal.executable)
+  ^bb10:  // pred: ^bb8
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb11(%14 : !hal.executable)
+  ^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+    util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    cf.br ^bb12
+  ^bb12:  // pred: ^bb11
+    %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  cf.br ^bb8
+^bb8:  // pred: ^bb7
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb9, ^bb10
+^bb9:  // pred: ^bb8
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb11(%executable : !hal.executable)
+^bb10:  // pred: ^bb8
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb11(%0 : !hal.executable)
+^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  cf.br ^bb12
+^bb12:  // pred: ^bb11
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  cf.br ^bb8
+^bb8:  // pred: ^bb7
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb9, ^bb10
+^bb9:  // pred: ^bb8
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb11(%executable : !hal.executable)
+^bb10:  // pred: ^bb8
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb11(%0 : !hal.executable)
+^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  cf.br ^bb12
+^bb12:  // pred: ^bb11
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  cf.br ^bb8
+^bb8:  // pred: ^bb7
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb9, ^bb10
+^bb9:  // pred: ^bb8
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb11(%executable : !hal.executable)
+^bb10:  // pred: ^bb8
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb11(%0 : !hal.executable)
+^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  cf.br ^bb12
+^bb12:  // pred: ^bb11
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  cf.br ^bb8
+^bb8:  // pred: ^bb7
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb9, ^bb10
+^bb9:  // pred: ^bb8
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb11(%executable : !hal.executable)
+^bb10:  // pred: ^bb8
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb11(%0 : !hal.executable)
+^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  cf.br ^bb12
+^bb12:  // pred: ^bb11
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  cf.br ^bb8
+^bb8:  // pred: ^bb7
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb9, ^bb10
+^bb9:  // pred: ^bb8
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb11(%executable : !hal.executable)
+^bb10:  // pred: ^bb8
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb11(%0 : !hal.executable)
+^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  cf.br ^bb12
+^bb12:  // pred: ^bb11
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After ArithUnsignedWhenEquivalentPass (arith-unsigned-when-equivalent) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %5 = util.cmp.eq %4, %1 : !hal.device
+    %6 = arith.cmpi slt, %2, %device_count : index
+    %7 = arith.andi %5, %6 : i1
+    cf.cond_br %7, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %2 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+    %9 = arith.cmpi eq, %3, %c0 : index
+    %10 = arith.select %8, %c1, %c0 : index
+    %11 = arith.addi %3, %10 : index
+    %12 = arith.andi %8, %9 : i1
+    %13 = arith.select %12, %device_n, %1 : !hal.device
+    %14 = arith.addi %2, %c1 : index
+    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %4, @__device_0 : !hal.device
+    cf.br ^bb8
+  ^bb8:  // pred: ^bb7
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_4 = util.global.load @__device_0 : !hal.device
+    %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %16 = arith.cmpi eq, %15, %c0 : index
+    cf.cond_br %16, ^bb9, ^bb10
+  ^bb9:  // pred: ^bb8
+    %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb11(%executable : !hal.executable)
+  ^bb10:  // pred: ^bb8
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb11(%0 : !hal.executable)
+  ^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+    util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    cf.br ^bb12
+  ^bb12:  // pred: ^bb11
+    %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %5 = util.cmp.eq %4, %1 : !hal.device
+    %6 = arith.cmpi slt, %2, %device_count : index
+    %7 = arith.andi %5, %6 : i1
+    cf.cond_br %7, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %2 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+    %9 = arith.cmpi eq, %3, %c0 : index
+    %10 = arith.select %8, %c1, %c0 : index
+    %11 = arith.addi %3, %10 : index
+    %12 = arith.andi %8, %9 : i1
+    %13 = arith.select %12, %device_n, %1 : !hal.device
+    %14 = arith.addi %2, %c1 : index
+    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %4, @__device_0 : !hal.device
+    cf.br ^bb8
+  ^bb8:  // pred: ^bb7
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_4 = util.global.load @__device_0 : !hal.device
+    %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %16 = arith.cmpi eq, %15, %c0 : index
+    cf.cond_br %16, ^bb9, ^bb10
+  ^bb9:  // pred: ^bb8
+    %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb11(%executable : !hal.executable)
+  ^bb10:  // pred: ^bb8
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb11(%0 : !hal.executable)
+  ^bb11(%17: !hal.executable):  // 2 preds: ^bb9, ^bb10
+    util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    cf.br ^bb12
+  ^bb12:  // pred: ^bb11
+    %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb8, ^bb9
+^bb8:  // pred: ^bb7
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb10(%executable : !hal.executable)
+^bb9:  // pred: ^bb7
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb10(%0 : !hal.executable)
+^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After CSE (cse) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  util.global.store %4, @__device_0 : !hal.device
+  %__device_0 = util.global.load @__device_0 : !hal.device
+  %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  %__device_0_4 = util.global.load @__device_0 : !hal.device
+  %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  cf.cond_br %16, ^bb8, ^bb9
+^bb8:  // pred: ^bb7
+  %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb10(%executable : !hal.executable)
+^bb9:  // pred: ^bb7
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb10(%0 : !hal.executable)
+^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %5 = util.cmp.eq %4, %1 : !hal.device
+    %6 = arith.cmpi slt, %2, %device_count : index
+    %7 = arith.andi %5, %6 : i1
+    cf.cond_br %7, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %2 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+    %9 = arith.cmpi eq, %3, %c0 : index
+    %10 = arith.select %8, %c1, %c0 : index
+    %11 = arith.addi %3, %10 : index
+    %12 = arith.andi %8, %9 : i1
+    %13 = arith.select %12, %device_n, %1 : !hal.device
+    %14 = arith.addi %2, %c1 : index
+    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    util.global.store %4, @__device_0 : !hal.device
+    %__device_0 = util.global.load @__device_0 : !hal.device
+    %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+    %__device_0_4 = util.global.load @__device_0 : !hal.device
+    %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index
+    %16 = arith.cmpi eq, %15, %c0 : index
+    cf.cond_br %16, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb10(%executable : !hal.executable)
+  ^bb9:  // pred: ^bb7
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb10(%0 : !hal.executable)
+  ^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+    util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c-1_i64 = arith.constant -1 : i64
+    %c3 = arith.constant 3 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c128 = arith.constant 128 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c1 = arith.constant 1 : index
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c-1_i32 = arith.constant -1 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %0 = util.null : !hal.fence
+    %c-1_i64 = arith.constant -1 : i64
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c64 = arith.constant 64 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %c-1_i64 = arith.constant -1 : i64
+  %c3 = arith.constant 3 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c1 = arith.constant 1 : index
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  %15 = arith.select %value_3, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  util.global.store %4, @__device_0 : !hal.device
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  cf.cond_br %16, ^bb8, ^bb9
+^bb8:  // pred: ^bb7
+  %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb10(%executable : !hal.executable)
+^bb9:  // pred: ^bb7
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb10(%0 : !hal.executable)
+^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %c-1_i32 = arith.constant -1 : i32
+  %c0_i64 = arith.constant 0 : i64
+  %0 = util.null : !hal.fence
+  %c-1_i64 = arith.constant -1 : i64
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+  %c1 = arith.constant 1 : index
+  %c64_i32 = arith.constant 64 : i32
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+    (%c0 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+    (%c1 : index)[%c0, %c8], 
+    (%c2 : index)[%c0, %c128]
+  ]) flags("None")
+  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+  hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+  util.return %cmd : !hal.command_buffer
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.initializer {
+  %0 = util.null : !hal.executable
+  %c14_i32 = arith.constant 14 : i32
+  %c-1 = arith.constant -1 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %c18_i32 = arith.constant 18 : i32
+  %false = arith.constant false
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %1 = util.null : !hal.device
+  %device_count = hal.devices.count : index
+  cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+  %5 = util.cmp.eq %4, %1 : !hal.device
+  %6 = arith.cmpi slt, %2, %device_count : index
+  %7 = arith.andi %5, %6 : i1
+  cf.cond_br %7, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %device_n = hal.devices.get %2 : !hal.device
+  %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+  cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+^bb3:  // pred: ^bb2
+  %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  cf.br ^bb4(%value_1 : i1)
+^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+  %9 = arith.cmpi eq, %3, %c0 : index
+  %10 = arith.select %8, %c1, %c0 : index
+  %11 = arith.addi %3, %10 : index
+  %12 = arith.andi %8, %9 : i1
+  %13 = arith.select %12, %device_n, %1 : !hal.device
+  %14 = arith.addi %2, %c1 : index
+  cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+^bb5:  // pred: ^bb1
+  cf.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  cf.br ^bb7
+^bb7:  // 2 preds: ^bb5, ^bb6
+  %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+  %15 = arith.select %value_3, %c0, %c-1 : index
+  %16 = arith.cmpi eq, %15, %c0 : index
+  util.global.store %4, @__device_0 : !hal.device
+  util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1
+  cf.cond_br %16, ^bb8, ^bb9
+^bb8:  // pred: ^bb7
+  %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+  cf.br ^bb10(%executable : !hal.executable)
+^bb9:  // pred: ^bb7
+  util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  cf.br ^bb10(%0 : !hal.executable)
+^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+  util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+  util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.return
+}
+
+// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+  %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c-1_i64 = arith.constant -1 : i64
+  %0 = util.null : !hal.fence
+  %c0_i64 = arith.constant 0 : i64
+  %c-1_i32 = arith.constant -1 : i32
+  %__device_0 = util.global.load immutable @__device_0 : !hal.device
+  %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  %element_type_f32 = hal.element_type<f32> : i32
+  %dense_row_major = hal.encoding_type<dense_row_major> : i32
+  hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+  %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+  hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+  hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+  %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+  %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+  hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+    (%buffer : !hal.buffer)[%c0, %c8], 
+    (%buffer_0 : !hal.buffer)[%c0, %c8], 
+    (%transient_buffer : !hal.buffer)[%c0, %c128]
+  ]) flags("None")
+  %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+  util.status.check_ok %status, "failed to wait on timepoint"
+  %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+  util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+}
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %5 = util.cmp.eq %4, %1 : !hal.device
+    %6 = arith.cmpi slt, %2, %device_count : index
+    %7 = arith.andi %5, %6 : i1
+    cf.cond_br %7, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %2 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+    %9 = arith.cmpi eq, %3, %c0 : index
+    %10 = arith.select %8, %c1, %c0 : index
+    %11 = arith.addi %3, %10 : index
+    %12 = arith.andi %8, %9 : i1
+    %13 = arith.select %12, %device_n, %1 : !hal.device
+    %14 = arith.addi %2, %c1 : index
+    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    %15 = arith.select %value_3, %c0, %c-1 : index
+    %16 = arith.cmpi eq, %15, %c0 : index
+    util.global.store %4, @__device_0 : !hal.device
+    cf.cond_br %16, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb10(%executable : !hal.executable)
+  ^bb9:  // pred: ^bb7
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb10(%0 : !hal.executable)
+  ^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+    util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module {
+  util.global private @__device_0 : !hal.device
+  util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+  util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+  util.initializer {
+    %0 = util.null : !hal.executable
+    %c14_i32 = arith.constant 14 : i32
+    %c-1 = arith.constant -1 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %c18_i32 = arith.constant 18 : i32
+    %false = arith.constant false
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = util.null : !hal.device
+    %device_count = hal.devices.count : index
+    cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device)
+  ^bb1(%2: index, %3: index, %4: !hal.device):  // 2 preds: ^bb0, ^bb4
+    %5 = util.cmp.eq %4, %1 : !hal.device
+    %6 = arith.cmpi slt, %2, %device_count : index
+    %7 = arith.andi %5, %6 : i1
+    cf.cond_br %7, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %device_n = hal.devices.get %2 : !hal.device
+    %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false
+    cf.cond_br %value, ^bb3, ^bb4(%false : i1)
+  ^bb3:  // pred: ^bb2
+    %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    cf.br ^bb4(%value_1 : i1)
+  ^bb4(%8: i1):  // 2 preds: ^bb2, ^bb3
+    %9 = arith.cmpi eq, %3, %c0 : index
+    %10 = arith.select %8, %c1, %c0 : index
+    %11 = arith.addi %3, %10 : index
+    %12 = arith.andi %8, %9 : i1
+    %13 = arith.select %12, %device_n, %1 : !hal.device
+    %14 = arith.addi %2, %c1 : index
+    cf.br ^bb1(%14, %11, %13 : index, index, !hal.device)
+  ^bb5:  // pred: ^bb1
+    cf.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    cf.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false
+    %15 = arith.select %value_3, %c0, %c-1 : index
+    %16 = arith.cmpi eq, %15, %c0 : index
+    util.global.store %4, @__device_0 : !hal.device
+    cf.cond_br %16, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable
+    cf.br ^bb10(%executable : !hal.executable)
+  ^bb9:  // pred: ^bb7
+    util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    cf.br ^bb10(%0 : !hal.executable)
+  ^bb10(%17: !hal.executable):  // 2 preds: ^bb8, ^bb9
+    util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer
+    util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    util.return
+  }
+  hal.executable private @multiple_results_dispatch_0 {
+    hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000
0006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D0000000000000902700401000000000
00105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000
000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"}
+  }
+  util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} {
+    %c1 = arith.constant 1 : index
+    %c64_i32 = arith.constant 64 : i32
+    %c128 = arith.constant 128 : index
+    %c8 = arith.constant 8 : index
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %c3 = arith.constant 3 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable
+    %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([
+      (%c0 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([
+      (%c1 : index)[%c0, %c8], 
+      (%c2 : index)[%c0, %c128]
+    ]) flags("None")
+    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
+    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
+    util.return %cmd : !hal.command_buffer
+  }
+  util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32
+    %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c0 = arith.constant 0 : index
+    %c-1_i64 = arith.constant -1 : i64
+    %0 = util.null : !hal.fence
+    %c0_i64 = arith.constant 0 : i64
+    %c-1_i32 = arith.constant -1 : i32
+    %__device_0 = util.global.load immutable @__device_0 : !hal.device
+    %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
+    %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator
+    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major)
+    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
+    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
+    %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128}
+    %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence
+    hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([
+      (%buffer : !hal.buffer)[%c0, %c8], 
+      (%buffer_0 : !hal.buffer)[%c0, %c8], 
+      (%transient_buffer : !hal.buffer)[%c0, %c128]
+    ]) flags("None")
+    %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32
+    util.status.check_ok %status, "failed to wait on timepoint"
+    %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view
+    util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view
+  }
+}
+
+
+// -----// IR Dump After ConversionPass (iree-vm-conversion) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.initializer {
+      %null = vm.const.ref.zero : !vm.ref<!hal.executable>
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c-1_0 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_1 = vm.const.i64.zero
+      %c1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref<!hal.device>
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %5 = vm.and.i32 %req, %slt : i32
+      vm.cond_br %5, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %6 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %buffer = vm.rodata.inline "_utf8_hal_device_id_C6650FF277232B5A" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
+      %buffer_3 = vm.rodata.inline "_utf8_local_1A8FF0278D7661D8" {alignment = 1 : i64} : !vm.buffer = "local*"
+      %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_3) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %7#1 : i64
+      %zero_4 = vm.const.i32.zero
+      %8 = vm.select.i32 %7#0, %nz, %zero_4 : i32
+      %c1_5 = vm.const.i32 1
+      vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %buffer_6 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
+      %buffer_7 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64"
+      %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_6, %buffer_7) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_8 = vm.cmp.nz.i64 %9#1 : i64
+      %zero_9 = vm.const.i32.zero
+      %10 = vm.select.i32 %9#0, %nz_8, %zero_9 : i32
+      %c1_10 = vm.const.i32 1
+      vm.br ^bb4(%10 : i32)
+    ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_1 : i64
+      %12 = vm.select.i64 %11, %c1, %zero_1 : i64
+      %13 = vm.add.i64 %3, %12 : i64
+      %14 = vm.and.i32 %11, %eq : i32
+      %ref_11 = vm.select.ref %14, %ref, %null_2 : !vm.ref<!hal.device>
+      %15 = vm.add.i64 %2, %c1 : i64
+      vm.br ^bb1(%15, %13, %ref_11 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %req, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>"
+      vm.br ^bb7
+    ^bb7:  // 2 preds: ^bb5, ^bb6
+      %buffer_12 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
+      %buffer_13 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64"
+      %16:2 = vm.call @hal.device.query.i64(%4, %buffer_12, %buffer_13) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_14 = vm.cmp.nz.i64 %16#1 : i64
+      %zero_15 = vm.const.i32.zero
+      %17 = vm.select.i32 %16#0, %nz_14, %zero_15 : i32
+      %c1_16 = vm.const.i32 1
+      %18 = vm.select.i64 %17, %zero_1, %c-1 : i64
+      %eq_17 = vm.cmp.eq.i64 %18, %zero_1 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_17, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %buffer_18 = vm.rodata.inline "multiple_results_dispatch_0_embedded_elf_arm_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000
1000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E7400000000003800000
0040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A00500000
0000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+      %buffer_19 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64"
+      %null_20 = vm.const.ref.zero : !vm.buffer
+      %ref_21 = vm.call @hal.executable.create(%4, %c-1_0, %buffer_19, %buffer_18, %null_20) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.br ^bb10(%ref_21 : !vm.ref<!hal.executable>)
+    ^bb9:  // pred: ^bb7
+      vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      vm.br ^bb10(%null : !vm.ref<!hal.executable>)
+    ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
+      vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_22 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_22, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c1 = vm.const.i64 1
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %c2 = vm.const.i64 2
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c3 = vm.const.i64 3
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %zero_1 = vm.const.i32.zero
+      %c3_2 = vm.const.i32 3
+      %c3_3 = vm.const.i32 3
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      %zero_4 = vm.const.i32.zero
+      %zero_5 = vm.const.i32.zero
+      %c1_6 = vm.const.i32 1
+      %c1_7 = vm.const.i32 1
+      %c1_8 = vm.const.i32 1
+      %zero_9 = vm.const.i64 0
+      %zero_10 = vm.const.i32.zero
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c2_11 = vm.const.i32 2
+      %null_12 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      %zero_13 = vm.const.i32.zero
+      %zero_14 = vm.const.i32.zero
+      %c1_15 = vm.const.i32 1
+      %c1_16 = vm.const.i32 1
+      %c1_17 = vm.const.i32 1
+      %zero_18 = vm.const.i64 0
+      %c1_19 = vm.const.i32 1
+      %null_20 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c2_21 = vm.const.i32 2
+      %null_22 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      %c28 = vm.const.i32 28
+      %c13 = vm.const.i32 13
+      %zero_23 = vm.const.i64.zero
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
+    vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple<!vm.ref<!hal.device>, i64> ...) -> (!vm.ref<!hal.device>, i64) attributes {nosideeffects}
+    vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref<!hal.buffer>)
+    vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref<!hal.buffer>) -> i32
+    vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref<!hal.buffer>) -> i32
+    vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
+    vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
+    vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+    vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+    vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+    vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
+    vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
+    vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+    vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i64) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+    vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
+    vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %buffer : !vm.ref<!hal.buffer>, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32)
+    vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64)
+    vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64)
+    vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+    vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>, %flags : i64)
+    vm.import private @hal.device.queue.fill(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64)
+    vm.import private @hal.device.queue.update(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+    vm.import private @hal.device.queue.copy(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+    vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+    vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i64)
+    vm.import private @hal.device.queue.barrier(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %flags : i64)
+    vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64)
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
+    vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
+    vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
+    vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %zero_0 = vm.const.i64.zero
+      %c-1_1 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %c553648160 = vm.const.i32 553648160
+      %c1 = vm.const.i32 1
+      %buffer = vm.rodata.inline "_utf8_input0_DCE99660CEB3F6B" {alignment = 1 : i64} : !vm.buffer = "input0"
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %buffer_3 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor"
+      %c16 = vm.const.i32 16
+      %c3075_4 = vm.const.i32 3075
+      vm.call @hal.buffer.assert(%ref, %buffer_3, %ref_2, %c8, %c16, %c3075_4) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %buffer_5 = vm.rodata.inline "_utf8_input1_B898B726583C85DA" {alignment = 1 : i64} : !vm.buffer = "input1"
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_5, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %buffer_7 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor"
+      %c16_8 = vm.const.i32 16
+      %c3075_9 = vm.const.i32 3075
+      vm.call @hal.buffer.assert(%ref_6, %buffer_7, %ref_2, %c8, %c16_8, %c3075_9) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %zero_10 = vm.const.i64.zero
+      %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %zero_12 = vm.const.i64.zero
+      %ref_13 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_11, %zero_0, %c48, %c3075, %c128, %zero_12) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %zero_14 = vm.const.i64.zero
+      %ref_15 = vm.call @hal.fence.create(%__device_0, %zero_14) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %zero_16 = vm.const.i64 0
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_11, %ref_15, %__multiple_results_memoize_result_0_device_0, %zero_16, [(%ref, %zero, %c8), (%ref_6, %zero, %c8), (%ref_13, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %zero_17 = vm.const.i64.zero
+      %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_17, [%ref_15]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_fail %0, "failed to wait on timepoint"
+      %ref_18 = vm.call.variadic @hal.buffer_view.create(%ref_13, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_19 = vm.call.variadic @hal.buffer_view.create(%ref_13, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_18, %ref_19 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After ReifyRodataTablesPass (iree-vm-reify-rodata-tables) //----- //
+vm.module public @module {
+  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.initializer {
+    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c-1_0 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_1 = vm.const.i64.zero
+    %c1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref<!hal.device>
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %5 = vm.and.i32 %req, %slt : i32
+    vm.cond_br %5, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %buffer = vm.rodata.inline "_utf8_hal_device_id_C6650FF277232B5A" {alignment = 1 : i64} : !vm.buffer = "hal.device.id"
+    %buffer_3 = vm.rodata.inline "_utf8_local_1A8FF0278D7661D8" {alignment = 1 : i64} : !vm.buffer = "local*"
+    %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_3) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %7#1 : i64
+    %zero_4 = vm.const.i32.zero
+    %8 = vm.select.i32 %7#0, %nz, %zero_4 : i32
+    %c1_5 = vm.const.i32 1
+    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %buffer_6 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
+    %buffer_7 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64"
+    %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_6, %buffer_7) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_8 = vm.cmp.nz.i64 %9#1 : i64
+    %zero_9 = vm.const.i32.zero
+    %10 = vm.select.i32 %9#0, %nz_8, %zero_9 : i32
+    %c1_10 = vm.const.i32 1
+    vm.br ^bb4(%10 : i32)
+  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_1 : i64
+    %12 = vm.select.i64 %11, %c1, %zero_1 : i64
+    %13 = vm.add.i64 %3, %12 : i64
+    %14 = vm.and.i32 %11, %eq : i32
+    %ref_11 = vm.select.ref %14, %ref, %null_2 : !vm.ref<!hal.device>
+    %15 = vm.add.i64 %2, %c1 : i64
+    vm.br ^bb1(%15, %13, %ref_11 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %req, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>"
+    vm.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    %buffer_12 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format"
+    %buffer_13 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64"
+    %16:2 = vm.call @hal.device.query.i64(%4, %buffer_12, %buffer_13) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_14 = vm.cmp.nz.i64 %16#1 : i64
+    %zero_15 = vm.const.i32.zero
+    %17 = vm.select.i32 %16#0, %nz_14, %zero_15 : i32
+    %c1_16 = vm.const.i32 1
+    %18 = vm.select.i64 %17, %zero_1, %c-1 : i64
+    %eq_17 = vm.cmp.eq.i64 %18, %zero_1 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_17, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %buffer_18 = vm.rodata.inline "multiple_results_dispatch_0_embedded_elf_arm_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010
00000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E740000000000380000000
40019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A0050000000
00000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    %buffer_19 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64"
+    %null_20 = vm.const.ref.zero : !vm.buffer
+    %ref_21 = vm.call @hal.executable.create(%4, %c-1_0, %buffer_19, %buffer_18, %null_20) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.br ^bb10(%ref_21 : !vm.ref<!hal.executable>)
+  ^bb9:  // pred: ^bb7
+    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
+  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
+    vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_22 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_22, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  }
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c1 = vm.const.i64 1
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %c2 = vm.const.i64 2
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c3 = vm.const.i64 3
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %zero_1 = vm.const.i32.zero
+    %c3_2 = vm.const.i32 3
+    %c3_3 = vm.const.i32 3
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    %zero_4 = vm.const.i32.zero
+    %zero_5 = vm.const.i32.zero
+    %c1_6 = vm.const.i32 1
+    %c1_7 = vm.const.i32 1
+    %c1_8 = vm.const.i32 1
+    %zero_9 = vm.const.i64 0
+    %zero_10 = vm.const.i32.zero
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c2_11 = vm.const.i32 2
+    %null_12 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    %zero_13 = vm.const.i32.zero
+    %zero_14 = vm.const.i32.zero
+    %c1_15 = vm.const.i32 1
+    %c1_16 = vm.const.i32 1
+    %c1_17 = vm.const.i32 1
+    %zero_18 = vm.const.i64 0
+    %c1_19 = vm.const.i32 1
+    %null_20 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c2_21 = vm.const.i32 2
+    %null_22 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    %c28 = vm.const.i32 28
+    %c13 = vm.const.i32 13
+    %zero_23 = vm.const.i64.zero
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
+  vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple<!vm.ref<!hal.device>, i64> ...) -> (!vm.ref<!hal.device>, i64) attributes {nosideeffects}
+  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref<!hal.buffer>)
+  vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
+  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
+  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i64) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
+  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %buffer : !vm.ref<!hal.buffer>, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32)
+  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64)
+  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>, %flags : i64)
+  vm.import private @hal.device.queue.fill(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.device.queue.update(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.copy(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.barrier(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %flags : i64)
+  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64)
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
+  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
+  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
+  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %zero_0 = vm.const.i64.zero
+    %c-1_1 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %c553648160 = vm.const.i32 553648160
+    %c1 = vm.const.i32 1
+    %buffer = vm.rodata.inline "_utf8_input0_DCE99660CEB3F6B" {alignment = 1 : i64} : !vm.buffer = "input0"
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %buffer_3 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor"
+    %c16 = vm.const.i32 16
+    %c3075_4 = vm.const.i32 3075
+    vm.call @hal.buffer.assert(%ref, %buffer_3, %ref_2, %c8, %c16, %c3075_4) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %buffer_5 = vm.rodata.inline "_utf8_input1_B898B726583C85DA" {alignment = 1 : i64} : !vm.buffer = "input1"
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_5, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %buffer_7 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor"
+    %c16_8 = vm.const.i32 16
+    %c3075_9 = vm.const.i32 3075
+    vm.call @hal.buffer.assert(%ref_6, %buffer_7, %ref_2, %c8, %c16_8, %c3075_9) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %zero_10 = vm.const.i64.zero
+    %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %zero_12 = vm.const.i64.zero
+    %ref_13 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_11, %zero_0, %c48, %c3075, %c128, %zero_12) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %zero_14 = vm.const.i64.zero
+    %ref_15 = vm.call @hal.fence.create(%__device_0, %zero_14) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %zero_16 = vm.const.i64 0
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_11, %ref_15, %__multiple_results_memoize_result_0_device_0, %zero_16, [(%ref, %zero, %c8), (%ref_6, %zero, %c8), (%ref_13, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %zero_17 = vm.const.i64.zero
+    %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_17, [%ref_15]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_fail %0, "failed to wait on timepoint"
+    %ref_18 = vm.call.variadic @hal.buffer_view.create(%ref_13, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_19 = vm.call.variadic @hal.buffer_view.create(%ref_13, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_18, %ref_19 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+}
+
+// -----// IR Dump After HoistInlinedRodataPass (iree-vm-hoist-inlined-rodata) //----- //
+vm.module public @module {
+  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52_0 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.initializer {
+    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c-1_0 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_1 = vm.const.i64.zero
+    %c1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref<!hal.device>
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %5 = vm.and.i32 %req, %slt : i32
+    vm.cond_br %5, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %7#1 : i64
+    %zero_3 = vm.const.i32.zero
+    %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
+    %c1_4 = vm.const.i32 1
+    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_5 = vm.cmp.nz.i64 %9#1 : i64
+    %zero_6 = vm.const.i32.zero
+    %10 = vm.select.i32 %9#0, %nz_5, %zero_6 : i32
+    %c1_7 = vm.const.i32 1
+    vm.br ^bb4(%10 : i32)
+  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_1 : i64
+    %12 = vm.select.i64 %11, %c1, %zero_1 : i64
+    %13 = vm.add.i64 %3, %12 : i64
+    %14 = vm.and.i32 %11, %eq : i32
+    %ref_8 = vm.select.ref %14, %ref, %null_2 : !vm.ref<!hal.device>
+    %15 = vm.add.i64 %2, %c1 : i64
+    vm.br ^bb1(%15, %13, %ref_8 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %req, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>"
+    vm.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_0 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52_0 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1 : !vm.buffer
+    %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_0, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_9 = vm.cmp.nz.i64 %16#1 : i64
+    %zero_10 = vm.const.i32.zero
+    %17 = vm.select.i32 %16#0, %nz_9, %zero_10 : i32
+    %c1_11 = vm.const.i32 1
+    %18 = vm.select.i64 %17, %zero_1, %c-1 : i64
+    %eq_12 = vm.cmp.eq.i64 %18, %zero_1 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_12, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2 : !vm.buffer
+    %null_13 = vm.const.ref.zero : !vm.buffer
+    %ref_14 = vm.call @hal.executable.create(%4, %c-1_0, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2, %multiple_results_dispatch_0_embedded_elf_arm_64, %null_13) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.br ^bb10(%ref_14 : !vm.ref<!hal.executable>)
+  ^bb9:  // pred: ^bb7
+    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
+  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
+    vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_15 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_15, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  }
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c1 = vm.const.i64 1
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %c2 = vm.const.i64 2
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c3 = vm.const.i64 3
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %zero_1 = vm.const.i32.zero
+    %c3_2 = vm.const.i32 3
+    %c3_3 = vm.const.i32 3
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    %zero_4 = vm.const.i32.zero
+    %zero_5 = vm.const.i32.zero
+    %c1_6 = vm.const.i32 1
+    %c1_7 = vm.const.i32 1
+    %c1_8 = vm.const.i32 1
+    %zero_9 = vm.const.i64 0
+    %zero_10 = vm.const.i32.zero
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c2_11 = vm.const.i32 2
+    %null_12 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    %zero_13 = vm.const.i32.zero
+    %zero_14 = vm.const.i32.zero
+    %c1_15 = vm.const.i32 1
+    %c1_16 = vm.const.i32 1
+    %c1_17 = vm.const.i32 1
+    %zero_18 = vm.const.i64 0
+    %c1_19 = vm.const.i32 1
+    %null_20 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c2_21 = vm.const.i32 2
+    %null_22 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    %c28 = vm.const.i32 28
+    %c13 = vm.const.i32 13
+    %zero_23 = vm.const.i64.zero
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
+  vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple<!vm.ref<!hal.device>, i64> ...) -> (!vm.ref<!hal.device>, i64) attributes {nosideeffects}
+  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref<!hal.buffer>)
+  vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
+  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
+  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i64) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
+  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %buffer : !vm.ref<!hal.buffer>, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32)
+  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64)
+  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>, %flags : i64)
+  vm.import private @hal.device.queue.fill(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.device.queue.update(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.copy(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.barrier(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %flags : i64)
+  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64)
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
+  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
+  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
+  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A_3 {alignment = 1 : i64} "tensor"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %zero_0 = vm.const.i64.zero
+    %c-1_1 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %c553648160 = vm.const.i32 553648160
+    %c1 = vm.const.i32 1
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    %c16 = vm.const.i32 16
+    %c3075_3 = vm.const.i32 3075
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_2, %c8, %c16, %c3075_3) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_4 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A_3 : !vm.buffer
+    %c16_5 = vm.const.i32 16
+    %c3075_6 = vm.const.i32 3075
+    vm.call @hal.buffer.assert(%ref_4, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_2, %c8, %c16_5, %c3075_6) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %zero_7 = vm.const.i64.zero
+    %ref_8 = vm.call @hal.fence.create(%__device_0, %zero_7) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %zero_9 = vm.const.i64.zero
+    %ref_10 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_8, %zero_0, %c48, %c3075, %c128, %zero_9) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %zero_11 = vm.const.i64.zero
+    %ref_12 = vm.call @hal.fence.create(%__device_0, %zero_11) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %zero_13 = vm.const.i64 0
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_8, %ref_12, %__multiple_results_memoize_result_0_device_0, %zero_13, [(%ref, %zero, %c8), (%ref_4, %zero, %c8), (%ref_10, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %zero_14 = vm.const.i64.zero
+    %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_14, [%ref_12]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_fail %0, "failed to wait on timepoint"
+    %ref_15 = vm.call.variadic @hal.buffer_view.create(%ref_10, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_16 = vm.call.variadic @hal.buffer_view.create(%ref_10, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_15, %ref_16 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+}
+
+// -----// IR Dump After DeduplicateRodataPass (iree-vm-deduplicate-rodata) //----- //
+vm.module public @module {
+  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.initializer {
+    %null = vm.const.ref.zero : !vm.ref<!hal.executable>
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c-1_0 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_1 = vm.const.i64.zero
+    %c1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref<!hal.device>
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %5 = vm.and.i32 %req, %slt : i32
+    vm.cond_br %5, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %7#1 : i64
+    %zero_3 = vm.const.i32.zero
+    %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32
+    %c1_4 = vm.const.i32 1
+    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_5 = vm.cmp.nz.i64 %9#1 : i64
+    %zero_6 = vm.const.i32.zero
+    %10 = vm.select.i32 %9#0, %nz_5, %zero_6 : i32
+    %c1_7 = vm.const.i32 1
+    vm.br ^bb4(%10 : i32)
+  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_1 : i64
+    %12 = vm.select.i64 %11, %c1, %zero_1 : i64
+    %13 = vm.add.i64 %3, %12 : i64
+    %14 = vm.and.i32 %11, %eq : i32
+    %ref_8 = vm.select.ref %14, %ref, %null_2 : !vm.ref<!hal.device>
+    %15 = vm.add.i64 %2, %c1 : i64
+    vm.br ^bb1(%15, %13, %ref_8 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %req, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>"
+    vm.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_9 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_10 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_9, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_10) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_11 = vm.cmp.nz.i64 %16#1 : i64
+    %zero_12 = vm.const.i32.zero
+    %17 = vm.select.i32 %16#0, %nz_11, %zero_12 : i32
+    %c1_13 = vm.const.i32 1
+    %18 = vm.select.i64 %17, %zero_1, %c-1 : i64
+    %eq_14 = vm.cmp.eq.i64 %18, %zero_1 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_14, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_15 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %null_16 = vm.const.ref.zero : !vm.buffer
+    %ref_17 = vm.call @hal.executable.create(%4, %c-1_0, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_15, %multiple_results_dispatch_0_embedded_elf_arm_64, %null_16) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.br ^bb10(%ref_17 : !vm.ref<!hal.executable>)
+  ^bb9:  // pred: ^bb7
+    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    vm.br ^bb10(%null : !vm.ref<!hal.executable>)
+  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
+    vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_18 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_18, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  }
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c1 = vm.const.i64 1
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %c2 = vm.const.i64 2
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c3 = vm.const.i64 3
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %zero_1 = vm.const.i32.zero
+    %c3_2 = vm.const.i32 3
+    %c3_3 = vm.const.i32 3
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    %zero_4 = vm.const.i32.zero
+    %zero_5 = vm.const.i32.zero
+    %c1_6 = vm.const.i32 1
+    %c1_7 = vm.const.i32 1
+    %c1_8 = vm.const.i32 1
+    %zero_9 = vm.const.i64 0
+    %zero_10 = vm.const.i32.zero
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c2_11 = vm.const.i32 2
+    %null_12 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    %zero_13 = vm.const.i32.zero
+    %zero_14 = vm.const.i32.zero
+    %c1_15 = vm.const.i32 1
+    %c1_16 = vm.const.i32 1
+    %c1_17 = vm.const.i32 1
+    %zero_18 = vm.const.i64 0
+    %c1_19 = vm.const.i32 1
+    %null_20 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c2_21 = vm.const.i32 2
+    %null_22 = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    %c28 = vm.const.i32 28
+    %c13 = vm.const.i32 13
+    %zero_23 = vm.const.i64.zero
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
+  vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple<!vm.ref<!hal.device>, i64> ...) -> (!vm.ref<!hal.device>, i64) attributes {nosideeffects}
+  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref<!hal.buffer>)
+  vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
+  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
+  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i64) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
+  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %buffer : !vm.ref<!hal.buffer>, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32)
+  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64)
+  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>, %flags : i64)
+  vm.import private @hal.device.queue.fill(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.device.queue.update(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.copy(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.barrier(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %flags : i64)
+  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64)
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
+  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
+  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
+  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %zero_0 = vm.const.i64.zero
+    %c-1_1 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %c553648160 = vm.const.i32 553648160
+    %c1 = vm.const.i32 1
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    %c16 = vm.const.i32 16
+    %c3075_3 = vm.const.i32 3075
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_2, %c8, %c16, %c3075_3) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_4 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %_utf8_tensor_FC1814BC4A58F22A_5 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    %c16_6 = vm.const.i32 16
+    %c3075_7 = vm.const.i32 3075
+    vm.call @hal.buffer.assert(%ref_4, %_utf8_tensor_FC1814BC4A58F22A_5, %ref_2, %c8, %c16_6, %c3075_7) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %zero_8 = vm.const.i64.zero
+    %ref_9 = vm.call @hal.fence.create(%__device_0, %zero_8) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %zero_10 = vm.const.i64.zero
+    %ref_11 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_9, %zero_0, %c48, %c3075, %c128, %zero_10) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %zero_12 = vm.const.i64.zero
+    %ref_13 = vm.call @hal.fence.create(%__device_0, %zero_12) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %zero_14 = vm.const.i64 0
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_9, %ref_13, %__multiple_results_memoize_result_0_device_0, %zero_14, [(%ref, %zero, %c8), (%ref_4, %zero, %c8), (%ref_11, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %zero_15 = vm.const.i64.zero
+    %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_15, [%ref_13]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_fail %0, "failed to wait on timepoint"
+    %ref_16 = vm.call.variadic @hal.buffer_view.create(%ref_11, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_17 = vm.call.variadic @hal.buffer_view.create(%ref_11, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_16, %ref_17 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+}
+
+// -----// IR Dump After DropUnusedCallsPass (iree-vm-drop-unused-calls) //----- //
+vm.module public @module {
+  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.initializer {
+    %null = vm.const.ref.zero : !vm.buffer
+    %null_0 = vm.const.ref.zero : !vm.ref<!hal.executable>
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_1 = vm.const.i64.zero
+    %c1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref<!hal.device>
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %5 = vm.and.i32 %req, %slt : i32
+    vm.cond_br %5, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %6 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %7#1 : i64
+    %8 = vm.select.i32 %7#0, %nz, %zero : i32
+    vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %9#1 : i64
+    %10 = vm.select.i32 %9#0, %nz_3, %zero : i32
+    vm.br ^bb4(%10 : i32)
+  ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_1 : i64
+    %12 = vm.select.i64 %11, %c1, %zero_1 : i64
+    %13 = vm.add.i64 %3, %12 : i64
+    %14 = vm.and.i32 %11, %eq : i32
+    %ref_4 = vm.select.ref %14, %ref, %null_2 : !vm.ref<!hal.device>
+    %15 = vm.add.i64 %2, %c1 : i64
+    vm.br ^bb1(%15, %13, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %req, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>"
+    vm.br ^bb7
+  ^bb7:  // 2 preds: ^bb5, ^bb6
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %16#1 : i64
+    %17 = vm.select.i32 %16#0, %nz_7, %zero : i32
+    %18 = vm.select.i64 %17, %zero_1, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %18, %zero_1 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.br ^bb10(%ref_10 : !vm.ref<!hal.executable>)
+  ^bb9:  // pred: ^bb7
+    vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    vm.br ^bb10(%null_0 : !vm.ref<!hal.executable>)
+  ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
+    vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  }
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.ex.file.from_memory(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref<!hal.file>
+  vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple<!vm.ref<!hal.device>, i64> ...) -> (!vm.ref<!hal.device>, i64) attributes {nosideeffects}
+  vm.import private @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.allocator.import(%allocator : !vm.ref<!hal.allocator>, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref<!hal.buffer>)
+  vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref<!hal.buffer>) -> i32
+  vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32
+  vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects}
+  vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...)
+  vm.import private @hal.channel.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.split(%channel : !vm.ref<!hal.channel>, %color : i32, %key : i32, %flags : i64) -> !vm.ref<!hal.channel> attributes {nosideeffects}
+  vm.import private @hal.channel.rank_and_count(%channel : !vm.ref<!hal.channel>) -> (i32, i32) attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer)
+  vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %buffer : !vm.ref<!hal.buffer>, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32)
+  vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64)
+  vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref<!hal.command_buffer>, %channel : !vm.ref<!hal.channel>, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref<!hal.buffer>, %recv_buffer : !vm.ref<!hal.buffer>, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.dealloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %buffer : !vm.ref<!hal.buffer>, %flags : i64)
+  vm.import private @hal.device.queue.fill(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64)
+  vm.import private @hal.device.queue.update(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.copy(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.read(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_file : !vm.ref<!hal.file>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.write(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_file : !vm.ref<!hal.file>, %target_offset : i64, %length : i64, %flags : i64)
+  vm.import private @hal.device.queue.barrier(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %flags : i64)
+  vm.import private @hal.device.queue.execute(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64)
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.queue.flush(%device : !vm.ref<!hal.device>, %queue_affinity : i64)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref<!hal.fence> ...) -> !vm.ref<!hal.fence> attributes {nosideeffects}
+  vm.import private @hal.fence.query(%fence : !vm.ref<!hal.fence>) -> i32
+  vm.import private @hal.fence.signal(%fence : !vm.ref<!hal.fence>)
+  vm.import private @hal.fence.fail(%fence : !vm.ref<!hal.fence>, %status : i32)
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_fail %0, "failed to wait on timepoint"
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+}
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %null = vm.const.ref.zero : !vm.buffer
+      %null_0 = vm.const.ref.zero : !vm.ref<!hal.executable>
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_1 = vm.const.i64.zero
+      %c1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref<!hal.device>
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %5 = vm.and.i32 %req, %slt : i32
+      vm.cond_br %5, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %6 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %7#1 : i64
+      %8 = vm.select.i32 %7#0, %nz, %zero : i32
+      vm.cond_br %8, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %9#1 : i64
+      %10 = vm.select.i32 %9#0, %nz_3, %zero : i32
+      vm.br ^bb4(%10 : i32)
+    ^bb4(%11: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_1 : i64
+      %12 = vm.select.i64 %11, %c1, %zero_1 : i64
+      %13 = vm.add.i64 %3, %12 : i64
+      %14 = vm.and.i32 %11, %eq : i32
+      %ref_4 = vm.select.ref %14, %ref, %null_2 : !vm.ref<!hal.device>
+      %15 = vm.add.i64 %2, %c1 : i64
+      vm.br ^bb1(%15, %13, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %req, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>"
+      vm.br ^bb7
+    ^bb7:  // 2 preds: ^bb5, ^bb6
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %16#1 : i64
+      %17 = vm.select.i32 %16#0, %nz_7, %zero : i32
+      %18 = vm.select.i64 %17, %zero_1, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %18, %zero_1 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.br ^bb10(%ref_10 : !vm.ref<!hal.executable>)
+    ^bb9:  // pred: ^bb7
+      vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+      vm.br ^bb10(%null_0 : !vm.ref<!hal.executable>)
+    ^bb10(%19: !vm.ref<!hal.executable>):  // 2 preds: ^bb8, ^bb9
+      vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_fail %0, "failed to wait on timepoint"
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2(%1: i32):  // pred: ^bb0
+      vm.fail %1, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2(%1: i32):  // pred: ^bb0
+      vm.fail %1, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After ResolveRodataLoadsPass (iree-vm-resolve-rodata-loads) //----- //
+vm.module public @module {
+  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.initializer {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_br %0, ^bb2(%0 : i32), ^bb1
+  ^bb1:  // pred: ^bb0
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb2(%1: i32):  // pred: ^bb0
+    vm.fail %1, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+  %c13 = vm.const.i32 13
+  %c28 = vm.const.i32 28
+  %c2 = vm.const.i32 2
+  %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+  %c1 = vm.const.i32 1
+  %c3 = vm.const.i32 3
+  %c64 = vm.const.i32 64
+  %c128 = vm.const.i64 128
+  %c8 = vm.const.i64 8
+  %zero = vm.const.i64.zero
+  %zero_0 = vm.const.i32.zero
+  %c-1 = vm.const.i64 -1
+  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+  %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+  vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+  vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+  vm.return %ref : !vm.ref<!hal.command_buffer>
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+vm.initializer {
+  %c1 = vm.const.i32 1
+  %null = vm.const.ref.zero : !vm.buffer
+  %c14 = vm.const.i32 14
+  %c-1 = vm.const.i64 -1
+  %c18 = vm.const.i32 18
+  %zero = vm.const.i32.zero
+  %zero_0 = vm.const.i64.zero
+  %c1_1 = vm.const.i64 1
+  %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+  %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+  %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+  vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+  %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+  %5 = vm.xor.i32 %rnz, %c1 : i32
+  %slt = vm.cmp.lt.i64.s %2, %1 : i64
+  %6 = vm.and.i32 %5, %slt : i32
+  vm.cond_br %6, ^bb2, ^bb5
+^bb2:  // pred: ^bb1
+  %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+  %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+  %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+  %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+  %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+  %nz = vm.cmp.nz.i64 %8#1 : i64
+  %9 = vm.select.i32 %8#0, %nz, %zero : i32
+  vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+^bb3:  // pred: ^bb2
+  %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+  %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+  %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+  %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+  %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+  vm.br ^bb4(%11 : i32)
+^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+  %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+  %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+  %14 = vm.add.i64 %3, %13 : i64
+  %15 = vm.and.i32 %12, %eq : i32
+  %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+  %16 = vm.add.i64 %2, %c1_1 : i64
+  vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+^bb5:  // pred: ^bb1
+  vm.cond_br %5, ^bb6, ^bb7
+^bb6:  // pred: ^bb5
+  vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+^bb7:  // pred: ^bb5
+  %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+  %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+  %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+  %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+  %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+  %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+  %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+  vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+  vm.cond_br %eq_8, ^bb8, ^bb9
+^bb8:  // pred: ^bb7
+  %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+  %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+  %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+  vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+  vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.return
+^bb9:  // pred: ^bb7
+  vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+  %c16 = vm.const.i32 16
+  %c1 = vm.const.i32 1
+  %c553648160 = vm.const.i32 553648160
+  %c3075 = vm.const.i32 3075
+  %c48 = vm.const.i32 48
+  %c2 = vm.const.i64 2
+  %c8 = vm.const.i64 8
+  %c64 = vm.const.i64 64
+  %c128 = vm.const.i64 128
+  %zero = vm.const.i64.zero
+  %c-1 = vm.const.i64 -1
+  %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+  %c-1_0 = vm.const.i32 -1
+  %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+  %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+  vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+  %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+  %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+  %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+  vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+  %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+  vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+  %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+  %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+  vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+  %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+  %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+  %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+  vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+  vm.cond_br %0, ^bb2(%0 : i32), ^bb1
+^bb1:  // pred: ^bb0
+  %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+  %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+  vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+^bb2(%1: i32):  // pred: ^bb0
+  vm.fail %1, "failed to wait on timepoint"
+}
+
+// -----// IR Dump After Inliner (inline) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2(%1: i32):  // pred: ^bb0
+      vm.fail %1, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2(%1: i32):  // pred: ^bb0
+      vm.fail %1, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After DropUnusedCallsPass (iree-vm-drop-unused-calls) //----- //
+vm.module public @module {
+  vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.initializer {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_br %0, ^bb2, ^bb1
+  ^bb1:  // pred: ^bb0
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb2:  // pred: ^bb0
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+}
+
+// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2, ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2:  // pred: ^bb0
+      vm.fail %0, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2, ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2:  // pred: ^bb0
+      vm.fail %0, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.initializer {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2, ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2:  // pred: ^bb0
+      vm.fail %0, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  }
+}
+
+
+// -----// IR Dump After GlobalInitializationPass (iree-vm-global-initialization) //----- //
+vm.module public @module {
+  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_br %0, ^bb2, ^bb1
+  ^bb1:  // pred: ^bb0
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb2:  // pred: ^bb0
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  vm.export @__init
+  vm.func private @__init() {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.br ^bb10
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  ^bb10:  // pred: ^bb8
+    vm.return
+  }
+  vm.export @__deinit
+  vm.func private @__deinit() {
+    vm.return
+  }
+}
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2, ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_7, %ref_8 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2:  // pred: ^bb0
+      vm.fail %0, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+    vm.export @__init
+    vm.func private @__init() {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.export @__deinit
+    vm.func private @__deinit() {
+      vm.return
+    }
+  }
+}
+
+
+// -----// IR Dump After CSE (cse) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2, ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2:  // pred: ^bb0
+      vm.fail %0, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+    vm.export @__init
+    vm.func private @__init() {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.export @__deinit
+    vm.func private @__deinit() {
+      vm.return
+    }
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module attributes {vm.toplevel} {
+  vm.module public @module {
+    vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+    vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+    vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+    vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+    vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+    vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F30000000000000000000000000000000010201000000010000000100000000000000000000000000000
0000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D00010
1010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000
000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+    vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+      %c13 = vm.const.i32 13
+      %c28 = vm.const.i32 28
+      %c2 = vm.const.i32 2
+      %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+      %c1 = vm.const.i32 1
+      %c3 = vm.const.i32 3
+      %c64 = vm.const.i32 64
+      %c128 = vm.const.i64 128
+      %c8 = vm.const.i64 8
+      %zero = vm.const.i64.zero
+      %zero_0 = vm.const.i32.zero
+      %c-1 = vm.const.i64 -1
+      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+      %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+      vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+      vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+      vm.return %ref : !vm.ref<!hal.command_buffer>
+    }
+    vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+    vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+    vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+    vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+    vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+    vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+    vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+    vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+    vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+    vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+    vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+    vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+    vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+    vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+    vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+    vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+    vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+    vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+    vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+      %c16 = vm.const.i32 16
+      %c1 = vm.const.i32 1
+      %c553648160 = vm.const.i32 553648160
+      %c3075 = vm.const.i32 3075
+      %c48 = vm.const.i32 48
+      %c2 = vm.const.i64 2
+      %c8 = vm.const.i64 8
+      %c64 = vm.const.i64 64
+      %c128 = vm.const.i64 128
+      %zero = vm.const.i64.zero
+      %c-1 = vm.const.i64 -1
+      %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+      %c-1_0 = vm.const.i32 -1
+      %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+      %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+      %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+      vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+      vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+      vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+      %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+      %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+      vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+      %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+      vm.cond_br %0, ^bb2, ^bb1
+    ^bb1:  // pred: ^bb0
+      %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+      vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+    ^bb2:  // pred: ^bb0
+      vm.fail %0, "failed to wait on timepoint"
+    }
+    vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+    vm.export @__init
+    vm.func private @__init() {
+      %c1 = vm.const.i32 1
+      %null = vm.const.ref.zero : !vm.buffer
+      %c14 = vm.const.i32 14
+      %c-1 = vm.const.i64 -1
+      %c18 = vm.const.i32 18
+      %zero = vm.const.i32.zero
+      %zero_0 = vm.const.i64.zero
+      %c1_1 = vm.const.i64 1
+      %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+      %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+      %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+      vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+    ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+      %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+      %5 = vm.xor.i32 %rnz, %c1 : i32
+      %slt = vm.cmp.lt.i64.s %2, %1 : i64
+      %6 = vm.and.i32 %5, %slt : i32
+      vm.cond_br %6, ^bb2, ^bb5
+    ^bb2:  // pred: ^bb1
+      %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+      %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+      %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+      %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+      %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz = vm.cmp.nz.i64 %8#1 : i64
+      %9 = vm.select.i32 %8#0, %nz, %zero : i32
+      vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+    ^bb3:  // pred: ^bb2
+      %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+      %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+      vm.br ^bb4(%11 : i32)
+    ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+      %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+      %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+      %14 = vm.add.i64 %3, %13 : i64
+      %15 = vm.and.i32 %12, %eq : i32
+      %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+      %16 = vm.add.i64 %2, %c1_1 : i64
+      vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+    ^bb5:  // pred: ^bb1
+      vm.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+    ^bb7:  // pred: ^bb5
+      %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+      %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+      %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+      %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+      %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+      %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+      %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+      vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+      vm.cond_br %eq_8, ^bb8, ^bb9
+    ^bb8:  // pred: ^bb7
+      %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+      %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+      vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+      %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+      vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+      vm.return
+    ^bb9:  // pred: ^bb7
+      vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+    }
+    vm.export @__deinit
+    vm.func private @__deinit() {
+      vm.return
+    }
+  }
+}
+
+
+// -----// IR Dump After DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- //
+vm.module public @module {
+  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_br %0, ^bb2, ^bb1
+  ^bb1:  // pred: ^bb0
+    %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb2:  // pred: ^bb0
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  vm.export @__init
+  vm.func private @__init() {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+}
+
+// -----// IR Dump After AnnotateFunctionsPass (iree-vm-annotate-functions) //----- //
+vm.module public @module {
+  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never, vm.unwind} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref<!hal.fence> ...) -> i32
+    vm.cond_br %0, ^bb2, ^bb1
+  ^bb1:  // pred: ^bb0
+    %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb2:  // pred: ^bb0
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  vm.export @__init
+  vm.func private @__init() attributes {vm.unwind} {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+}
+
+// -----// IR Dump After ConvertToYieldableCallsPass (iree-vm-convert-to-yieldable-calls) //----- //
+vm.module public @module {
+  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never, vm.unwind} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic.yieldable @hal.fence.await(%c-1_0, %zero, %ref_5) {segment_sizes = dense<[-1, -1, 1]> : vector<3xi16>, segment_types = [i32, i64, !vm.ref<!hal.fence>]} : (i32, i64, !vm.ref<!hal.fence>) -> ^bb1 (i32)
+  ^bb1(%0: i32):  // pred: ^bb0
+    vm.cond_br %0, ^bb3, ^bb2
+  ^bb2:  // pred: ^bb1
+    %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb3:  // pred: ^bb1
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  vm.export @__init
+  vm.func private @__init() attributes {vm.unwind} {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+}
+
+// -----// IR Dump After DropOptimizationBarriersPass (iree-vm-drop-optimization-barriers) //----- //
+vm.module public @module {
+  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never, vm.unwind} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic.yieldable @hal.fence.await(%c-1_0, %zero, %ref_5) {segment_sizes = dense<[-1, -1, 1]> : vector<3xi16>, segment_types = [i32, i64, !vm.ref<!hal.fence>]} : (i32, i64, !vm.ref<!hal.fence>) -> ^bb1 (i32)
+  ^bb1(%0: i32):  // pred: ^bb0
+    vm.cond_br %0, ^bb3, ^bb2
+  ^bb2:  // pred: ^bb1
+    %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb3:  // pred: ^bb1
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  vm.export @__init
+  vm.func private @__init() attributes {vm.unwind} {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+}
+
+// -----// IR Dump After MaterializeRefDiscardsPass (iree-vm-materialize-ref-discards) //----- //
+vm.module public @module {
+  vm.global.ref private mutable @__device_0 : !vm.ref<!hal.device>
+  vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+  vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+  vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id"
+  vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*"
+  vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format"
+  vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64"
+  vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000
00000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D0001010
10100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000
0000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>
+  vm.func private @__multiple_results_memoize_apply() -> !vm.ref<!hal.command_buffer> attributes {inlining_policy = #util.inline.never, vm.unwind} {
+    %c13 = vm.const.i32 13
+    %c28 = vm.const.i32 28
+    %c2 = vm.const.i32 2
+    %null = vm.const.ref.zero : !vm.ref<!hal.buffer>
+    %c1 = vm.const.i32 1
+    %c3 = vm.const.i32 3
+    %c64 = vm.const.i32 64
+    %c128 = vm.const.i64 128
+    %c8 = vm.const.i64 8
+    %zero = vm.const.i64.zero
+    %zero_0 = vm.const.i32.zero
+    %c-1 = vm.const.i64 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref<!hal.device>, i32, i32, i64, i32) -> !vm.ref<!hal.command_buffer>
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32, i64, i32 ..., tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i64) -> ()
+    vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref<!hal.command_buffer>) -> ()
+    vm.return %ref : !vm.ref<!hal.command_buffer>
+  }
+  vm.import private @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32)
+  vm.import private @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects}
+  vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...)
+  vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects}
+  vm.import private @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref<!hal.command_buffer> attributes {minimum_version = 6 : i32}
+  vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref<!hal.command_buffer>)
+  vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64)
+  vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple<i32, i32, !vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects}
+  vm.import private @hal.device.query.i64(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects}
+  vm.import private @hal.device.queue.alloca(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref<!hal.buffer>
+  vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %wait_fence : !vm.ref<!hal.fence>, %signal_fence : !vm.ref<!hal.fence>, %command_buffer : !vm.ref<!hal.command_buffer>, %flags : i64, %binding_table : tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+  vm.import private @hal.devices.count() -> i32 attributes {nosideeffects}
+  vm.import private @hal.devices.get(%index : i32) -> !vm.ref<!hal.device> attributes {nosideeffects}
+  vm.import private @hal.executable.create(%device : !vm.ref<!hal.device>, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref<!hal.executable> attributes {nosideeffects}
+  vm.import private @hal.fence.create(%device : !vm.ref<!hal.device>, %flags : i64) -> !vm.ref<!hal.fence>
+  vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref<!hal.fence> ...) -> i32 attributes {vm.yield}
+  vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0"
+  vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor"
+  vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1"
+  vm.func private @multiple_results(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> (!vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} {
+    %c16 = vm.const.i32 16
+    %c1 = vm.const.i32 1
+    %c553648160 = vm.const.i32 553648160
+    %c3075 = vm.const.i32 3075
+    %c48 = vm.const.i32 48
+    %c2 = vm.const.i64 2
+    %c8 = vm.const.i64 8
+    %c64 = vm.const.i64 64
+    %c128 = vm.const.i64 128
+    %zero = vm.const.i64.zero
+    %c-1 = vm.const.i64 -1
+    %null = vm.const.ref.zero : !vm.ref<!hal.fence>
+    %c-1_0 = vm.const.i32 -1
+    %__device_0 = vm.global.load.ref @__device_0 : !vm.ref<!hal.device>
+    %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
+    %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer
+    vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer
+    vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...)
+    %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
+    vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> ()
+    %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, i64, i32, i32, i64, i64) -> !vm.ref<!hal.buffer>
+    %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref<!hal.device>, i64) -> !vm.ref<!hal.fence>
+    vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref<!hal.device>, i64, !vm.ref<!hal.fence>, !vm.ref<!hal.fence>, !vm.ref<!hal.command_buffer>, i64, tuple<!vm.ref<!hal.buffer>, i64, i64> ...)
+    vm.call.variadic.yieldable @hal.fence.await(%c-1_0, %zero, %ref_5) {segment_sizes = dense<[-1, -1, 1]> : vector<3xi16>, segment_types = [i32, i64, !vm.ref<!hal.fence>]} : (i32, i64, !vm.ref<!hal.fence>) -> ^bb1 (i32)
+  ^bb1(%0: i32):  // pred: ^bb0
+    vm.cond_br %0, ^bb3, ^bb2
+  ^bb2:  // pred: ^bb1
+    %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view>
+    vm.return %ref_6, %ref_7 : !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>
+  ^bb3:  // pred: ^bb1
+    vm.discard.refs %ref_4 : !vm.ref<!hal.buffer>
+    vm.fail %0, "failed to wait on timepoint"
+  }
+  vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}}
+  vm.export @__init
+  vm.func private @__init() attributes {vm.unwind} {
+    %c1 = vm.const.i32 1
+    %null = vm.const.ref.zero : !vm.buffer
+    %c14 = vm.const.i32 14
+    %c-1 = vm.const.i64 -1
+    %c18 = vm.const.i32 18
+    %zero = vm.const.i32.zero
+    %zero_0 = vm.const.i64.zero
+    %c1_1 = vm.const.i64 1
+    %null_2 = vm.const.ref.zero : !vm.ref<!hal.device>
+    %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32
+    %1 = vm.ext.i32.i64.s %0 : i32 -> i64
+    vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref<!hal.device>)
+  ^bb1(%2: i64, %3: i64, %4: !vm.ref<!hal.device>):  // 2 preds: ^bb0, ^bb4
+    %rnz = vm.cmp.nz.ref %4 : !vm.ref<!hal.device>
+    %5 = vm.xor.i32 %rnz, %c1 : i32
+    %slt = vm.cmp.lt.i64.s %2, %1 : i64
+    %6 = vm.and.i32 %5, %slt : i32
+    vm.cond_br %6, ^bb2, ^bb5
+  ^bb2:  // pred: ^bb1
+    vm.discard.refs %4 : !vm.ref<!hal.device>
+    %7 = vm.trunc.i64.i32 %2 : i64 -> i32
+    %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
+    %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer
+    %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer
+    %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz = vm.cmp.nz.i64 %8#1 : i64
+    %9 = vm.select.i32 %8#0, %nz, %zero : i32
+    vm.cond_br %9, ^bb3, ^bb4(%zero : i32)
+  ^bb3:  // pred: ^bb2
+    %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_3 = vm.cmp.nz.i64 %10#1 : i64
+    %11 = vm.select.i32 %10#0, %nz_3, %zero : i32
+    vm.br ^bb4(%11 : i32)
+  ^bb4(%12: i32):  // 2 preds: ^bb2, ^bb3
+    %eq = vm.cmp.eq.i64 %3, %zero_0 : i64
+    %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64
+    %14 = vm.add.i64 %3, %13 : i64
+    %15 = vm.and.i32 %12, %eq : i32
+    %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref<!hal.device>
+    %16 = vm.add.i64 %2, %c1_1 : i64
+    vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref<!hal.device>)
+  ^bb5:  // pred: ^bb1
+    vm.discard.refs %null_2 : !vm.ref<!hal.device>
+    vm.cond_br %5, ^bb6, ^bb7
+  ^bb6:  // pred: ^bb5
+    vm.discard.refs %null, %4 : !vm.buffer, !vm.ref<!hal.device>
+    vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>"
+  ^bb7:  // pred: ^bb5
+    %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer
+    %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer
+    %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
+    %nz_7 = vm.cmp.nz.i64 %17#1 : i64
+    %18 = vm.select.i32 %17#0, %nz_7, %zero : i32
+    %19 = vm.select.i64 %18, %zero_0, %c-1 : i64
+    %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64
+    vm.global.store.ref %4, @__device_0 : !vm.ref<!hal.device>
+    vm.cond_br %eq_8, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer
+    %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref<!hal.device>, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref<!hal.executable>
+    vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref<!hal.executable>
+    %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref<!hal.command_buffer>
+    vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref<!hal.command_buffer>
+    vm.return
+  ^bb9:  // pred: ^bb7
+    vm.discard.refs %null, %4, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 : !vm.buffer, !vm.ref<!hal.device>, !vm.buffer
+    vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]"
+  }
+}
+

From 9fb4c614de7d6617882d7bb9dc7dc2e12e540520 Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Fri, 16 Jan 2026 20:41:56 -0800
Subject: [PATCH 11/13] IREE: baseline attempt to compile Gemmini tile MLIR
 (expected fail log)

---
 .../mini_cnn_block.gemmini_tile.mlir          | 191 ++++++++++++++++++
 .../logs/mini_cnn_gemmini_tile.iree_fail.txt  |   4 +
 2 files changed, 195 insertions(+)
 create mode 100644 experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir
 create mode 100644 experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt

diff --git a/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir b/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir
new file mode 100644
index 0000000..3a03623
--- /dev/null
+++ b/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir
@@ -0,0 +1,191 @@
+module {
+  func.func @mini_cnn_block(%arg0: memref<1x3x32x32xf32>, %arg1: memref<16x3x3x3xf32>, %arg2: memref<32x16x3x3xf32>, %arg3: memref<1x32x26x26xf32>) {
+    %alloc = memref.alloc() : memref<1x16x30x30xf32>
+    %alloc_0 = memref.alloc() : memref<1x32x26x26xf32>
+    %alloc_1 = memref.alloc() : memref<1x32x32x3xf32>
+    %alloc_2 = memref.alloc() : memref<27x16xf32>
+    %alloc_3 = memref.alloc() : memref<16xi32>
+    %alloc_4 = memref.alloc() : memref<900x16xf32>
+    %c30_i64 = arith.constant 30 : i64
+    %c3 = arith.constant 3 : index
+    %c3_5 = arith.constant 3 : index
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c1_6 = arith.constant 1 : index
+    scf.for %arg4 = %c0 to %c1 step %c1_6 {
+      %c0_27 = arith.constant 0 : index
+      %c3_28 = arith.constant 3 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c32_31 = arith.constant 32 : index
+        %c1_32 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c32_31 step %c1_32 {
+          %c0_33 = arith.constant 0 : index
+          %c32_34 = arith.constant 32 : index
+          %c1_35 = arith.constant 1 : index
+          scf.for %arg7 = %c0_33 to %c32_34 step %c1_35 {
+            %0 = memref.load %arg0[%arg4, %arg5, %arg6, %arg7] : memref<1x3x32x32xf32>
+            memref.store %0, %alloc_1[%arg4, %arg6, %arg7, %arg5] : memref<1x32x32x3xf32>
+          }
+        }
+      }
+    }
+    %c0_7 = arith.constant 0 : index
+    %c16 = arith.constant 16 : index
+    %c1_8 = arith.constant 1 : index
+    scf.for %arg4 = %c0_7 to %c16 step %c1_8 {
+      %c0_27 = arith.constant 0 : index
+      %c3_28 = arith.constant 3 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c3_31 = arith.constant 3 : index
+        %c1_32 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 {
+          %c0_33 = arith.constant 0 : index
+          %c3_34 = arith.constant 3 : index
+          %c1_35 = arith.constant 1 : index
+          scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 {
+            %0 = arith.muli %arg6, %c3 : index
+            %1 = arith.muli %0, %c3_5 : index
+            %2 = arith.muli %arg7, %c3_5 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index
+            %5 = memref.load %arg1[%arg4, %arg5, %arg6, %arg7] : memref<16x3x3x3xf32>
+            memref.store %5, %alloc_2[%4, %arg4] : memref<27x16xf32>
+          }
+        }
+      }
+    }
+    %c3_i64 = arith.constant 3 : i64
+    gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64
+    %c0_9 = arith.constant 0 : index
+    %c1_10 = arith.constant 1 : index
+    %c1_11 = arith.constant 1 : index
+    scf.for %arg4 = %c0_9 to %c1_10 step %c1_11 {
+      %c0_27 = arith.constant 0 : index
+      %c16_28 = arith.constant 16 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c30 = arith.constant 30 : index
+        %c1_31 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c30 step %c1_31 {
+          %c0_32 = arith.constant 0 : index
+          %c30_33 = arith.constant 30 : index
+          %c1_34 = arith.constant 1 : index
+          scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 {
+            %c30_35 = arith.constant 30 : index
+            %0 = arith.muli %arg4, %c30_35 : index
+            %1 = arith.muli %0, %c30_35 : index
+            %2 = arith.muli %arg6, %c30_35 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg7 : index
+            %5 = memref.load %alloc_4[%4, %arg5] : memref<900x16xf32>
+            memref.store %5, %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc_1 : memref<1x32x32x3xf32>
+    memref.dealloc %alloc_2 : memref<27x16xf32>
+    memref.dealloc %alloc_4 : memref<900x16xf32>
+    memref.dealloc %alloc_3 : memref<16xi32>
+    %alloc_12 = memref.alloc() : memref<1x30x30x16xf32>
+    %alloc_13 = memref.alloc() : memref<144x32xf32>
+    %alloc_14 = memref.alloc() : memref<32xi32>
+    %alloc_15 = memref.alloc() : memref<676x32xf32>
+    %c26_i64 = arith.constant 26 : i64
+    %c3_16 = arith.constant 3 : index
+    %c16_17 = arith.constant 16 : index
+    %c0_18 = arith.constant 0 : index
+    %c1_19 = arith.constant 1 : index
+    %c1_20 = arith.constant 1 : index
+    scf.for %arg4 = %c0_18 to %c1_19 step %c1_20 {
+      %c0_27 = arith.constant 0 : index
+      %c16_28 = arith.constant 16 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c30 = arith.constant 30 : index
+        %c1_31 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c30 step %c1_31 {
+          %c0_32 = arith.constant 0 : index
+          %c30_33 = arith.constant 30 : index
+          %c1_34 = arith.constant 1 : index
+          scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 {
+            %0 = memref.load %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32>
+            memref.store %0, %alloc_12[%arg4, %arg6, %arg7, %arg5] : memref<1x30x30x16xf32>
+          }
+        }
+      }
+    }
+    %c0_21 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1_22 = arith.constant 1 : index
+    scf.for %arg4 = %c0_21 to %c32 step %c1_22 {
+      %c0_27 = arith.constant 0 : index
+      %c16_28 = arith.constant 16 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c3_31 = arith.constant 3 : index
+        %c1_32 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 {
+          %c0_33 = arith.constant 0 : index
+          %c3_34 = arith.constant 3 : index
+          %c1_35 = arith.constant 1 : index
+          scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 {
+            %0 = arith.muli %arg6, %c3_16 : index
+            %1 = arith.muli %0, %c16_17 : index
+            %2 = arith.muli %arg7, %c16_17 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index
+            %5 = memref.load %arg2[%arg4, %arg5, %arg6, %arg7] : memref<32x16x3x3xf32>
+            memref.store %5, %alloc_13[%4, %arg4] : memref<144x32xf32>
+          }
+        }
+      }
+    }
+    %c3_i64_23 = arith.constant 3 : i64
+    gemmini.tile_conv %alloc_12 %alloc_13 %alloc_14 %alloc_15 %c26_i64 %c26_i64 %c3_i64_23 : memref<1x30x30x16xf32> memref<144x32xf32> memref<32xi32> memref<676x32xf32> i64 i64 i64
+    %c0_24 = arith.constant 0 : index
+    %c1_25 = arith.constant 1 : index
+    %c1_26 = arith.constant 1 : index
+    scf.for %arg4 = %c0_24 to %c1_25 step %c1_26 {
+      %c0_27 = arith.constant 0 : index
+      %c32_28 = arith.constant 32 : index
+      %c1_29 = arith.constant 1 : index
+      scf.for %arg5 = %c0_27 to %c32_28 step %c1_29 {
+        %c0_30 = arith.constant 0 : index
+        %c26 = arith.constant 26 : index
+        %c1_31 = arith.constant 1 : index
+        scf.for %arg6 = %c0_30 to %c26 step %c1_31 {
+          %c0_32 = arith.constant 0 : index
+          %c26_33 = arith.constant 26 : index
+          %c1_34 = arith.constant 1 : index
+          scf.for %arg7 = %c0_32 to %c26_33 step %c1_34 {
+            %c26_35 = arith.constant 26 : index
+            %0 = arith.muli %arg4, %c26_35 : index
+            %1 = arith.muli %0, %c26_35 : index
+            %2 = arith.muli %arg6, %c26_35 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg7 : index
+            %5 = memref.load %alloc_15[%4, %arg5] : memref<676x32xf32>
+            memref.store %5, %alloc_0[%arg4, %arg5, %arg6, %arg7] : memref<1x32x26x26xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc_12 : memref<1x30x30x16xf32>
+    memref.dealloc %alloc_13 : memref<144x32xf32>
+    memref.dealloc %alloc_15 : memref<676x32xf32>
+    memref.dealloc %alloc_14 : memref<32xi32>
+    linalg.copy ins(%alloc_0 : memref<1x32x26x26xf32>) outs(%arg3 : memref<1x32x26x26xf32>)
+    memref.dealloc %alloc : memref<1x16x30x30xf32>
+    memref.dealloc %alloc_0 : memref<1x32x26x26xf32>
+    return
+  }
+}
+
diff --git a/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt
new file mode 100644
index 0000000..d1d1619
--- /dev/null
+++ b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt
@@ -0,0 +1,4 @@
+/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: error: Dialect `gemmini' not found for custom op 'gemmini.tile_conv' 
+    gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64
+    ^
+/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: note: Available dialects: affine, amdgpu, arith, arm_neon, arm_sme, arm_sve, bufferization, builtin, cf, check, chlo, complex, emitc, flow, func, gpu, hal, hal_inline, hal_loader, io_parameters, iree_codegen, iree_cpu, iree_encoding, iree_gpu, iree_linalg_ext, iree_tensor_ext, iree_vector_ext, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, pcf, pdl, pdl_interp, quant, rocdl, scf, shape, shard, spirv, stablehlo, stream, tensor, tm_tensor, torch, torch_c, tosa, transform, ub, util, vector, vhlo, vm, vmvx ; for more info on dialect registration see https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management

From f0a03f6766ae93ea740076edd9cd13af7f12d975 Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Fri, 16 Jan 2026 20:47:12 -0800
Subject: [PATCH 12/13] IREE: baseline attempt to compile Gemmini tile MLIR
 (expected fail log)

---
 experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt
index d1d1619..5193e6f 100644
--- a/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt
+++ b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt
@@ -1,4 +1,4 @@
-/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: error: Dialect `gemmini' not found for custom op 'gemmini.tile_conv' 
+experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: error: Dialect `gemmini' not found for custom op 'gemmini.tile_conv' 
     gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64
     ^
-/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: note: Available dialects: affine, amdgpu, arith, arm_neon, arm_sme, arm_sve, bufferization, builtin, cf, check, chlo, complex, emitc, flow, func, gpu, hal, hal_inline, hal_loader, io_parameters, iree_codegen, iree_cpu, iree_encoding, iree_gpu, iree_linalg_ext, iree_tensor_ext, iree_vector_ext, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, pcf, pdl, pdl_interp, quant, rocdl, scf, shape, shard, spirv, stablehlo, stream, tensor, tm_tensor, torch, torch_c, tosa, transform, ub, util, vector, vhlo, vm, vmvx ; for more info on dialect registration see https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management
+experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: note: Available dialects: affine, amdgpu, arith, arm_neon, arm_sme, arm_sve, bufferization, builtin, cf, check, chlo, complex, emitc, flow, func, gpu, hal, hal_inline, hal_loader, io_parameters, iree_codegen, iree_cpu, iree_encoding, iree_gpu, iree_linalg_ext, iree_tensor_ext, iree_vector_ext, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, pcf, pdl, pdl_interp, quant, rocdl, scf, shape, shard, spirv, stablehlo, stream, tensor, tm_tensor, torch, torch_c, tosa, transform, ub, util, vector, vhlo, vm, vmvx ; for more info on dialect registration see https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management

From f2b44f76594654d08931c6a470e2e96ca85e22cf Mon Sep 17 00:00:00 2001
From: sparsh <sparshsingh@berkeley.edu>
Date: Fri, 16 Jan 2026 20:47:19 -0800
Subject: [PATCH 13/13] Gemmini: add demo IR dump log for conv2d_block1

---
 .../conv2d_block1.print-after-all.demo.mlir   | 693 ++++++++++++++++++
 1 file changed, 693 insertions(+)
 create mode 100644 experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir

diff --git a/experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir b/experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir
new file mode 100644
index 0000000..e22f289
--- /dev/null
+++ b/experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir
@@ -0,0 +1,693 @@
+// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- //
+module {
+  func.func @conv2d_block1(%arg0: memref<1x32x32x32xf16>, %arg1: memref<3x3x32x64xf16>, %arg2: memref<1x30x30x64xf32>) {
+    %alloc = memref.alloc() : memref<288x64xf16>
+    %alloc_0 = memref.alloc() : memref<900x64xf32>
+    %alloc_1 = memref.alloc() : memref<64xi32>
+    %c0_i32 = arith.constant 0 : i32
+    linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<64xi32>)
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c3 = arith.constant 3 : index
+    scf.for %arg3 = %c0 to %c3 step %c1 {
+      %c3_3 = arith.constant 3 : index
+      scf.for %arg4 = %c0 to %c3_3 step %c1 {
+        %c32 = arith.constant 32 : index
+        scf.for %arg5 = %c0 to %c32 step %c1 {
+          %c64 = arith.constant 64 : index
+          scf.for %arg6 = %c0 to %c64 step %c1 {
+            %c3_4 = arith.constant 3 : index
+            %c32_5 = arith.constant 32 : index
+            %0 = arith.muli %arg3, %c3_4 : index
+            %1 = arith.muli %0, %c32_5 : index
+            %2 = arith.muli %arg4, %c32_5 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index
+            %5 = memref.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<3x3x32x64xf16>
+            memref.store %5, %alloc[%4, %arg6] : memref<288x64xf16>
+          }
+        }
+      }
+    }
+    %c30_i64 = arith.constant 30 : i64
+    %c3_i64 = arith.constant 3 : i64
+    gemmini.tile_conv %arg0 %alloc %alloc_1 %alloc_0 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x32xf16> memref<288x64xf16> memref<64xi32> memref<900x64xf32> i64 i64 i64
+    %c1_2 = arith.constant 1 : index
+    scf.for %arg3 = %c0 to %c1_2 step %c1 {
+      %c30 = arith.constant 30 : index
+      scf.for %arg4 = %c0 to %c30 step %c1 {
+        %c30_3 = arith.constant 30 : index
+        scf.for %arg5 = %c0 to %c30_3 step %c1 {
+          %c64 = arith.constant 64 : index
+          scf.for %arg6 = %c0 to %c64 step %c1 {
+            %c30_4 = arith.constant 30 : index
+            %0 = arith.muli %arg3, %c30_4 : index
+            %1 = arith.muli %0, %c30_4 : index
+            %2 = arith.muli %c30_4, %arg4 : index
+            %3 = arith.addi %1, %2 : index
+            %4 = arith.addi %3, %arg5 : index
+            %5 = memref.load %alloc_0[%4, %arg6] : memref<900x64xf32>
+            memref.store %5, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x30x30x64xf32>
+          }
+        }
+      }
+    }
+    memref.dealloc %alloc : memref<288x64xf16>
+    memref.dealloc %alloc_0 : memref<900x64xf32>
+    memref.dealloc %alloc_1 : memref<64xi32>
+    return
+  }
+}
+
+
+// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- //
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @conv2d_block1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %1 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %2 = llvm.insertvalue %arg23, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %3 = llvm.insertvalue %arg24, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %4 = llvm.insertvalue %arg25, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %5 = llvm.insertvalue %arg29, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %6 = llvm.insertvalue %arg26, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %7 = llvm.insertvalue %arg30, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %8 = llvm.insertvalue %arg27, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %9 = llvm.insertvalue %arg31, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %10 = llvm.insertvalue %arg28, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %11 = llvm.insertvalue %arg32, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %13 = llvm.insertvalue %arg0, %12[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %14 = llvm.insertvalue %arg1, %13[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %15 = llvm.insertvalue %arg2, %14[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %16 = llvm.insertvalue %arg3, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %17 = llvm.insertvalue %arg7, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %18 = llvm.insertvalue %arg4, %17[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %19 = llvm.insertvalue %arg8, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %20 = llvm.insertvalue %arg5, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %21 = llvm.insertvalue %arg9, %20[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %22 = llvm.insertvalue %arg6, %21[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %23 = llvm.insertvalue %arg10, %22[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %25 = llvm.insertvalue %arg11, %24[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %26 = llvm.insertvalue %arg12, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %27 = llvm.insertvalue %arg13, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %28 = llvm.insertvalue %arg14, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %29 = llvm.insertvalue %arg18, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %30 = llvm.insertvalue %arg15, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %31 = llvm.insertvalue %arg19, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %32 = llvm.insertvalue %arg16, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %33 = llvm.insertvalue %arg20, %32[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %34 = llvm.insertvalue %arg17, %33[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %35 = llvm.insertvalue %arg21, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %36 = llvm.mlir.constant(288 : index) : i64
+    %37 = llvm.mlir.constant(64 : index) : i64
+    %38 = llvm.mlir.constant(1 : index) : i64
+    %39 = llvm.mlir.constant(18432 : index) : i64
+    %40 = llvm.mlir.zero : !llvm.ptr
+    %41 = llvm.getelementptr %40[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16
+    %42 = llvm.ptrtoint %41 : !llvm.ptr to i64
+    %43 = llvm.call @malloc(%42) : (i64) -> !llvm.ptr
+    %44 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %45 = llvm.insertvalue %43, %44[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %46 = llvm.insertvalue %43, %45[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %47 = llvm.mlir.constant(0 : index) : i64
+    %48 = llvm.insertvalue %47, %46[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %49 = llvm.insertvalue %36, %48[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %50 = llvm.insertvalue %37, %49[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %51 = llvm.insertvalue %37, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %52 = llvm.insertvalue %38, %51[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %53 = llvm.mlir.constant(900 : index) : i64
+    %54 = llvm.mlir.constant(64 : index) : i64
+    %55 = llvm.mlir.constant(1 : index) : i64
+    %56 = llvm.mlir.constant(57600 : index) : i64
+    %57 = llvm.mlir.zero : !llvm.ptr
+    %58 = llvm.getelementptr %57[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %59 = llvm.ptrtoint %58 : !llvm.ptr to i64
+    %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr
+    %61 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %64 = llvm.mlir.constant(0 : index) : i64
+    %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %66 = llvm.insertvalue %53, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %67 = llvm.insertvalue %54, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %68 = llvm.insertvalue %54, %67[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %69 = llvm.insertvalue %55, %68[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %70 = llvm.mlir.constant(64 : index) : i64
+    %71 = llvm.mlir.constant(1 : index) : i64
+    %72 = llvm.mlir.zero : !llvm.ptr
+    %73 = llvm.getelementptr %72[%70] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+    %74 = llvm.ptrtoint %73 : !llvm.ptr to i64
+    %75 = llvm.call @malloc(%74) : (i64) -> !llvm.ptr
+    %76 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+    %77 = llvm.insertvalue %75, %76[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %78 = llvm.insertvalue %75, %77[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %79 = llvm.mlir.constant(0 : index) : i64
+    %80 = llvm.insertvalue %79, %78[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %81 = llvm.insertvalue %70, %80[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %82 = llvm.insertvalue %71, %81[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %83 = builtin.unrealized_conversion_cast %82 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<64xi32>
+    %84 = llvm.mlir.constant(0 : i32) : i32
+    linalg.fill ins(%84 : i32) outs(%83 : memref<64xi32>)
+    %85 = llvm.mlir.constant(0 : index) : i64
+    %86 = llvm.mlir.constant(1 : index) : i64
+    %87 = llvm.mlir.constant(3 : index) : i64
+    llvm.br ^bb1(%85 : i64)
+  ^bb1(%88: i64):  // 2 preds: ^bb0, ^bb11
+    %89 = llvm.icmp "slt" %88, %87 : i64
+    llvm.cond_br %89, ^bb2, ^bb12
+  ^bb2:  // pred: ^bb1
+    %90 = llvm.mlir.constant(3 : index) : i64
+    llvm.br ^bb3(%85 : i64)
+  ^bb3(%91: i64):  // 2 preds: ^bb2, ^bb10
+    %92 = llvm.icmp "slt" %91, %90 : i64
+    llvm.cond_br %92, ^bb4, ^bb11
+  ^bb4:  // pred: ^bb3
+    %93 = llvm.mlir.constant(32 : index) : i64
+    llvm.br ^bb5(%85 : i64)
+  ^bb5(%94: i64):  // 2 preds: ^bb4, ^bb9
+    %95 = llvm.icmp "slt" %94, %93 : i64
+    llvm.cond_br %95, ^bb6, ^bb10
+  ^bb6:  // pred: ^bb5
+    %96 = llvm.mlir.constant(64 : index) : i64
+    llvm.br ^bb7(%85 : i64)
+  ^bb7(%97: i64):  // 2 preds: ^bb6, ^bb8
+    %98 = llvm.icmp "slt" %97, %96 : i64
+    llvm.cond_br %98, ^bb8, ^bb9
+  ^bb8:  // pred: ^bb7
+    %99 = llvm.mlir.constant(3 : index) : i64
+    %100 = llvm.mlir.constant(32 : index) : i64
+    %101 = llvm.mul %88, %99 : i64
+    %102 = llvm.mul %101, %100 : i64
+    %103 = llvm.mul %91, %100 : i64
+    %104 = llvm.add %102, %103 : i64
+    %105 = llvm.add %104, %94 : i64
+    %106 = llvm.extractvalue %35[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %107 = llvm.mlir.constant(6144 : index) : i64
+    %108 = llvm.mul %88, %107 : i64
+    %109 = llvm.mlir.constant(2048 : index) : i64
+    %110 = llvm.mul %91, %109 : i64
+    %111 = llvm.add %108, %110 : i64
+    %112 = llvm.mlir.constant(64 : index) : i64
+    %113 = llvm.mul %94, %112 : i64
+    %114 = llvm.add %111, %113 : i64
+    %115 = llvm.add %114, %97 : i64
+    %116 = llvm.getelementptr %106[%115] : (!llvm.ptr, i64) -> !llvm.ptr, f16
+    %117 = llvm.load %116 : !llvm.ptr -> f16
+    %118 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %119 = llvm.mlir.constant(64 : index) : i64
+    %120 = llvm.mul %105, %119 : i64
+    %121 = llvm.add %120, %97 : i64
+    %122 = llvm.getelementptr %118[%121] : (!llvm.ptr, i64) -> !llvm.ptr, f16
+    llvm.store %117, %122 : f16, !llvm.ptr
+    %123 = llvm.add %97, %86 : i64
+    llvm.br ^bb7(%123 : i64)
+  ^bb9:  // pred: ^bb7
+    %124 = llvm.add %94, %86 : i64
+    llvm.br ^bb5(%124 : i64)
+  ^bb10:  // pred: ^bb5
+    %125 = llvm.add %91, %86 : i64
+    llvm.br ^bb3(%125 : i64)
+  ^bb11:  // pred: ^bb3
+    %126 = llvm.add %88, %86 : i64
+    llvm.br ^bb1(%126 : i64)
+  ^bb12:  // pred: ^bb1
+    %127 = llvm.mlir.constant(30 : i64) : i64
+    %128 = llvm.mlir.constant(3 : i64) : i64
+    %129 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %130 = llvm.ptrtoint %129 : !llvm.ptr to i64
+    %131 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %132 = llvm.ptrtoint %131 : !llvm.ptr to i64
+    %133 = llvm.extractvalue %82[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    %134 = llvm.ptrtoint %133 : !llvm.ptr to i64
+    %135 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %136 = llvm.ptrtoint %135 : !llvm.ptr to i64
+    %137 = llvm.mlir.constant(64 : i64) : i64
+    %138 = llvm.mlir.constant(2 : i64) : i64
+    %139 = llvm.mlir.constant(4575657221408424000 : i64) : i64
+    "gemmini.intr.config_st"(%138, %139) : (i64, i64) -> ()
+    %140 = llvm.mlir.constant(65540 : i64) : i64
+    %141 = llvm.mlir.constant(281474976710656 : i64) : i64
+    "gemmini.intr.config_ex"(%140, %141) : (i64, i64) -> ()
+    %142 = llvm.mlir.constant(0 : i64) : i64
+    %143 = llvm.mlir.constant(0 : i64) : i64
+    %144 = llvm.mlir.constant(0 : i64) : i64
+    %145 = llvm.mlir.constant(0 : i64) : i64
+    %146 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %147 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%146, %147) : (i64, i64) -> ()
+    %148 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %149 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%148, %149) : (i64, i64) -> ()
+    %150 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %151 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%150, %151) : (i64, i64) -> ()
+    %152 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %153 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%152, %153) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %132) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %130) : (i64, i64) -> ()
+    %154 = llvm.mlir.constant(256 : i64) : i64
+    %155 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%154, %155) : (i64, i64) -> ()
+    %156 = llvm.mlir.constant(16 : i64) : i64
+    %157 = llvm.add %132, %156 : i64
+    %158 = llvm.mlir.constant(64 : i64) : i64
+    %159 = llvm.add %134, %158 : i64
+    %160 = llvm.mlir.constant(16 : i64) : i64
+    %161 = llvm.add %136, %160 : i64
+    %162 = llvm.mlir.constant(0 : i64) : i64
+    %163 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %164 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%163, %164) : (i64, i64) -> ()
+    %165 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %166 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%165, %166) : (i64, i64) -> ()
+    %167 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %168 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%167, %168) : (i64, i64) -> ()
+    %169 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %170 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%169, %170) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%161, %157) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%159, %130) : (i64, i64) -> ()
+    %171 = llvm.mlir.constant(256 : i64) : i64
+    %172 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%171, %172) : (i64, i64) -> ()
+    %173 = llvm.mlir.constant(32 : i64) : i64
+    %174 = llvm.add %132, %173 : i64
+    %175 = llvm.mlir.constant(128 : i64) : i64
+    %176 = llvm.add %134, %175 : i64
+    %177 = llvm.mlir.constant(32 : i64) : i64
+    %178 = llvm.add %136, %177 : i64
+    %179 = llvm.mlir.constant(0 : i64) : i64
+    %180 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %181 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%180, %181) : (i64, i64) -> ()
+    %182 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %183 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%182, %183) : (i64, i64) -> ()
+    %184 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %185 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%184, %185) : (i64, i64) -> ()
+    %186 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %187 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%186, %187) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%178, %174) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%176, %130) : (i64, i64) -> ()
+    %188 = llvm.mlir.constant(256 : i64) : i64
+    %189 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%188, %189) : (i64, i64) -> ()
+    %190 = llvm.mlir.constant(48 : i64) : i64
+    %191 = llvm.add %132, %190 : i64
+    %192 = llvm.mlir.constant(192 : i64) : i64
+    %193 = llvm.add %134, %192 : i64
+    %194 = llvm.mlir.constant(48 : i64) : i64
+    %195 = llvm.add %136, %194 : i64
+    %196 = llvm.mlir.constant(0 : i64) : i64
+    %197 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %198 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%197, %198) : (i64, i64) -> ()
+    %199 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %200 = llvm.mlir.constant(281569467498512 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%199, %200) : (i64, i64) -> ()
+    %201 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %202 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%201, %202) : (i64, i64) -> ()
+    %203 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %204 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%203, %204) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%195, %191) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%193, %130) : (i64, i64) -> ()
+    %205 = llvm.mlir.constant(256 : i64) : i64
+    %206 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%205, %206) : (i64, i64) -> ()
+    %207 = llvm.mlir.constant(1472 : i64) : i64
+    %208 = llvm.add %132, %207 : i64
+    %209 = llvm.mlir.constant(0 : i64) : i64
+    %210 = llvm.mlir.constant(0 : i64) : i64
+    %211 = llvm.mlir.constant(736 : i64) : i64
+    %212 = llvm.add %130, %211 : i64
+    %213 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %214 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%213, %214) : (i64, i64) -> ()
+    %215 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %216 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%215, %216) : (i64, i64) -> ()
+    %217 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %218 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%217, %218) : (i64, i64) -> ()
+    %219 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %220 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%219, %220) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %208) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %212) : (i64, i64) -> ()
+    %221 = llvm.mlir.constant(256 : i64) : i64
+    %222 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%221, %222) : (i64, i64) -> ()
+    %223 = llvm.mlir.constant(1488 : i64) : i64
+    %224 = llvm.add %132, %223 : i64
+    %225 = llvm.mlir.constant(64 : i64) : i64
+    %226 = llvm.add %134, %225 : i64
+    %227 = llvm.mlir.constant(16 : i64) : i64
+    %228 = llvm.add %136, %227 : i64
+    %229 = llvm.mlir.constant(736 : i64) : i64
+    %230 = llvm.add %130, %229 : i64
+    %231 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %232 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%231, %232) : (i64, i64) -> ()
+    %233 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %234 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%233, %234) : (i64, i64) -> ()
+    %235 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %236 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%235, %236) : (i64, i64) -> ()
+    %237 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %238 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%237, %238) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%228, %224) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%226, %230) : (i64, i64) -> ()
+    %239 = llvm.mlir.constant(256 : i64) : i64
+    %240 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%239, %240) : (i64, i64) -> ()
+    %241 = llvm.mlir.constant(1504 : i64) : i64
+    %242 = llvm.add %132, %241 : i64
+    %243 = llvm.mlir.constant(128 : i64) : i64
+    %244 = llvm.add %134, %243 : i64
+    %245 = llvm.mlir.constant(32 : i64) : i64
+    %246 = llvm.add %136, %245 : i64
+    %247 = llvm.mlir.constant(736 : i64) : i64
+    %248 = llvm.add %130, %247 : i64
+    %249 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %250 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%249, %250) : (i64, i64) -> ()
+    %251 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %252 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%251, %252) : (i64, i64) -> ()
+    %253 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %254 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%253, %254) : (i64, i64) -> ()
+    %255 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %256 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%255, %256) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%246, %242) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%244, %248) : (i64, i64) -> ()
+    %257 = llvm.mlir.constant(256 : i64) : i64
+    %258 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%257, %258) : (i64, i64) -> ()
+    %259 = llvm.mlir.constant(1520 : i64) : i64
+    %260 = llvm.add %132, %259 : i64
+    %261 = llvm.mlir.constant(192 : i64) : i64
+    %262 = llvm.add %134, %261 : i64
+    %263 = llvm.mlir.constant(48 : i64) : i64
+    %264 = llvm.add %136, %263 : i64
+    %265 = llvm.mlir.constant(736 : i64) : i64
+    %266 = llvm.add %130, %265 : i64
+    %267 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %268 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%267, %268) : (i64, i64) -> ()
+    %269 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %270 = llvm.mlir.constant(281569466449936 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%269, %270) : (i64, i64) -> ()
+    %271 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %272 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%271, %272) : (i64, i64) -> ()
+    %273 = llvm.mlir.constant(6192449487634432 : i64) : i64
+    %274 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%273, %274) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%264, %260) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%262, %266) : (i64, i64) -> ()
+    %275 = llvm.mlir.constant(256 : i64) : i64
+    %276 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%275, %276) : (i64, i64) -> ()
+    %277 = llvm.mlir.constant(42240 : i64) : i64
+    %278 = llvm.add %132, %277 : i64
+    %279 = llvm.mlir.constant(0 : i64) : i64
+    %280 = llvm.mlir.constant(0 : i64) : i64
+    %281 = llvm.mlir.constant(22528 : i64) : i64
+    %282 = llvm.add %130, %281 : i64
+    %283 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %284 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%283, %284) : (i64, i64) -> ()
+    %285 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %286 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%285, %286) : (i64, i64) -> ()
+    %287 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %288 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%287, %288) : (i64, i64) -> ()
+    %289 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %290 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%289, %290) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %278) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %282) : (i64, i64) -> ()
+    %291 = llvm.mlir.constant(256 : i64) : i64
+    %292 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%291, %292) : (i64, i64) -> ()
+    %293 = llvm.mlir.constant(42256 : i64) : i64
+    %294 = llvm.add %132, %293 : i64
+    %295 = llvm.mlir.constant(64 : i64) : i64
+    %296 = llvm.add %134, %295 : i64
+    %297 = llvm.mlir.constant(16 : i64) : i64
+    %298 = llvm.add %136, %297 : i64
+    %299 = llvm.mlir.constant(22528 : i64) : i64
+    %300 = llvm.add %130, %299 : i64
+    %301 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %302 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%301, %302) : (i64, i64) -> ()
+    %303 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %304 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%303, %304) : (i64, i64) -> ()
+    %305 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %306 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%305, %306) : (i64, i64) -> ()
+    %307 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %308 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%307, %308) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%298, %294) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%296, %300) : (i64, i64) -> ()
+    %309 = llvm.mlir.constant(256 : i64) : i64
+    %310 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%309, %310) : (i64, i64) -> ()
+    %311 = llvm.mlir.constant(42272 : i64) : i64
+    %312 = llvm.add %132, %311 : i64
+    %313 = llvm.mlir.constant(128 : i64) : i64
+    %314 = llvm.add %134, %313 : i64
+    %315 = llvm.mlir.constant(32 : i64) : i64
+    %316 = llvm.add %136, %315 : i64
+    %317 = llvm.mlir.constant(22528 : i64) : i64
+    %318 = llvm.add %130, %317 : i64
+    %319 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %320 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%319, %320) : (i64, i64) -> ()
+    %321 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %322 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%321, %322) : (i64, i64) -> ()
+    %323 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %324 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%323, %324) : (i64, i64) -> ()
+    %325 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %326 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%325, %326) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%316, %312) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%314, %318) : (i64, i64) -> ()
+    %327 = llvm.mlir.constant(256 : i64) : i64
+    %328 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%327, %328) : (i64, i64) -> ()
+    %329 = llvm.mlir.constant(42288 : i64) : i64
+    %330 = llvm.add %132, %329 : i64
+    %331 = llvm.mlir.constant(192 : i64) : i64
+    %332 = llvm.add %134, %331 : i64
+    %333 = llvm.mlir.constant(48 : i64) : i64
+    %334 = llvm.add %136, %333 : i64
+    %335 = llvm.mlir.constant(22528 : i64) : i64
+    %336 = llvm.add %130, %335 : i64
+    %337 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %338 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%337, %338) : (i64, i64) -> ()
+    %339 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %340 = llvm.mlir.constant(281509337956368 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%339, %340) : (i64, i64) -> ()
+    %341 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %342 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%341, %342) : (i64, i64) -> ()
+    %343 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %344 = llvm.mlir.constant(65559 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%343, %344) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%334, %330) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%332, %336) : (i64, i64) -> ()
+    %345 = llvm.mlir.constant(256 : i64) : i64
+    %346 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%345, %346) : (i64, i64) -> ()
+    %347 = llvm.mlir.constant(43712 : i64) : i64
+    %348 = llvm.add %132, %347 : i64
+    %349 = llvm.mlir.constant(0 : i64) : i64
+    %350 = llvm.mlir.constant(0 : i64) : i64
+    %351 = llvm.mlir.constant(23264 : i64) : i64
+    %352 = llvm.add %130, %351 : i64
+    %353 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %354 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%353, %354) : (i64, i64) -> ()
+    %355 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %356 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%355, %356) : (i64, i64) -> ()
+    %357 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %358 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%357, %358) : (i64, i64) -> ()
+    %359 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %360 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%359, %360) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%136, %348) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%134, %352) : (i64, i64) -> ()
+    %361 = llvm.mlir.constant(256 : i64) : i64
+    %362 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%361, %362) : (i64, i64) -> ()
+    %363 = llvm.mlir.constant(43728 : i64) : i64
+    %364 = llvm.add %132, %363 : i64
+    %365 = llvm.mlir.constant(64 : i64) : i64
+    %366 = llvm.add %134, %365 : i64
+    %367 = llvm.mlir.constant(16 : i64) : i64
+    %368 = llvm.add %136, %367 : i64
+    %369 = llvm.mlir.constant(23264 : i64) : i64
+    %370 = llvm.add %130, %369 : i64
+    %371 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %372 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%371, %372) : (i64, i64) -> ()
+    %373 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %374 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%373, %374) : (i64, i64) -> ()
+    %375 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %376 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%375, %376) : (i64, i64) -> ()
+    %377 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %378 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%377, %378) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%368, %364) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%366, %370) : (i64, i64) -> ()
+    %379 = llvm.mlir.constant(256 : i64) : i64
+    %380 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%379, %380) : (i64, i64) -> ()
+    %381 = llvm.mlir.constant(43744 : i64) : i64
+    %382 = llvm.add %132, %381 : i64
+    %383 = llvm.mlir.constant(128 : i64) : i64
+    %384 = llvm.add %134, %383 : i64
+    %385 = llvm.mlir.constant(32 : i64) : i64
+    %386 = llvm.add %136, %385 : i64
+    %387 = llvm.mlir.constant(23264 : i64) : i64
+    %388 = llvm.add %130, %387 : i64
+    %389 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %390 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%389, %390) : (i64, i64) -> ()
+    %391 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %392 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%391, %392) : (i64, i64) -> ()
+    %393 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %394 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%393, %394) : (i64, i64) -> ()
+    %395 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %396 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%395, %396) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%386, %382) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%384, %388) : (i64, i64) -> ()
+    %397 = llvm.mlir.constant(256 : i64) : i64
+    %398 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%397, %398) : (i64, i64) -> ()
+    %399 = llvm.mlir.constant(43760 : i64) : i64
+    %400 = llvm.add %132, %399 : i64
+    %401 = llvm.mlir.constant(192 : i64) : i64
+    %402 = llvm.add %134, %401 : i64
+    %403 = llvm.mlir.constant(48 : i64) : i64
+    %404 = llvm.add %136, %403 : i64
+    %405 = llvm.mlir.constant(23264 : i64) : i64
+    %406 = llvm.add %130, %405 : i64
+    %407 = llvm.mlir.constant(18014535950532609 : i64) : i64
+    %408 = llvm.mlir.constant(4296933406 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config1"(%407, %408) : (i64, i64) -> ()
+    %409 = llvm.mlir.constant(844429225164800 : i64) : i64
+    %410 = llvm.mlir.constant(281509336907792 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config2"(%409, %410) : (i64, i64) -> ()
+    %411 = llvm.mlir.constant(844437817131008 : i64) : i64
+    %412 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config3"(%411, %412) : (i64, i64) -> ()
+    %413 = llvm.mlir.constant(2251799813685248 : i64) : i64
+    %414 = llvm.mlir.constant(65543 : i64) : i64
+    "gemmini.intr.loop_conv_ws_config4"(%413, %414) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config5"(%404, %400) : (i64, i64) -> ()
+    "gemmini.intr.loop_conv_ws_config6"(%402, %406) : (i64, i64) -> ()
+    %415 = llvm.mlir.constant(256 : i64) : i64
+    %416 = llvm.mlir.constant(1 : i64) : i64
+    "gemmini.intr.loop_conv_ws"(%415, %416) : (i64, i64) -> ()
+    %417 = llvm.mlir.constant(0 : i64) : i64
+    "gemmini.intr.flush"(%417, %417) : (i64, i64) -> ()
+    %418 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%85 : i64)
+  ^bb13(%419: i64):  // 2 preds: ^bb12, ^bb23
+    %420 = llvm.icmp "slt" %419, %418 : i64
+    llvm.cond_br %420, ^bb14, ^bb24
+  ^bb14:  // pred: ^bb13
+    %421 = llvm.mlir.constant(30 : index) : i64
+    llvm.br ^bb15(%85 : i64)
+  ^bb15(%422: i64):  // 2 preds: ^bb14, ^bb22
+    %423 = llvm.icmp "slt" %422, %421 : i64
+    llvm.cond_br %423, ^bb16, ^bb23
+  ^bb16:  // pred: ^bb15
+    %424 = llvm.mlir.constant(30 : index) : i64
+    llvm.br ^bb17(%85 : i64)
+  ^bb17(%425: i64):  // 2 preds: ^bb16, ^bb21
+    %426 = llvm.icmp "slt" %425, %424 : i64
+    llvm.cond_br %426, ^bb18, ^bb22
+  ^bb18:  // pred: ^bb17
+    %427 = llvm.mlir.constant(64 : index) : i64
+    llvm.br ^bb19(%85 : i64)
+  ^bb19(%428: i64):  // 2 preds: ^bb18, ^bb20
+    %429 = llvm.icmp "slt" %428, %427 : i64
+    llvm.cond_br %429, ^bb20, ^bb21
+  ^bb20:  // pred: ^bb19
+    %430 = llvm.mlir.constant(30 : index) : i64
+    %431 = llvm.mul %419, %430 : i64
+    %432 = llvm.mul %431, %430 : i64
+    %433 = llvm.mul %422, %430 : i64
+    %434 = llvm.add %432, %433 : i64
+    %435 = llvm.add %434, %425 : i64
+    %436 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    %437 = llvm.mlir.constant(64 : index) : i64
+    %438 = llvm.mul %435, %437 : i64
+    %439 = llvm.add %438, %428 : i64
+    %440 = llvm.getelementptr %436[%439] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %441 = llvm.load %440 : !llvm.ptr -> f32
+    %442 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+    %443 = llvm.mlir.constant(57600 : index) : i64
+    %444 = llvm.mul %419, %443 : i64
+    %445 = llvm.mlir.constant(1920 : index) : i64
+    %446 = llvm.mul %422, %445 : i64
+    %447 = llvm.add %444, %446 : i64
+    %448 = llvm.mlir.constant(64 : index) : i64
+    %449 = llvm.mul %425, %448 : i64
+    %450 = llvm.add %447, %449 : i64
+    %451 = llvm.add %450, %428 : i64
+    %452 = llvm.getelementptr %442[%451] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %441, %452 : f32, !llvm.ptr
+    %453 = llvm.add %428, %86 : i64
+    llvm.br ^bb19(%453 : i64)
+  ^bb21:  // pred: ^bb19
+    %454 = llvm.add %425, %86 : i64
+    llvm.br ^bb17(%454 : i64)
+  ^bb22:  // pred: ^bb17
+    %455 = llvm.add %422, %86 : i64
+    llvm.br ^bb15(%455 : i64)
+  ^bb23:  // pred: ^bb15
+    %456 = llvm.add %419, %86 : i64
+    llvm.br ^bb13(%456 : i64)
+  ^bb24:  // pred: ^bb13
+    %457 = llvm.extractvalue %52[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%457) : (!llvm.ptr) -> ()
+    %458 = llvm.extractvalue %69[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
+    llvm.call @free(%458) : (!llvm.ptr) -> ()
+    %459 = llvm.extractvalue %82[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
+    llvm.call @free(%459) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+}
+
+