From 15daa59274bd6cb276b05d0aaf68ff215291dedf Mon Sep 17 00:00:00 2001 From: sparsh Date: Tue, 9 Dec 2025 01:33:05 -0800 Subject: [PATCH 01/13] Buddy Gemmini: IR dumps for matmul (and more configs) --- experiments/gemmini/inputs/matmul.mlir | 7 ++ .../gemmini/logs/matmul.print-after-all.mlir | 113 ++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 experiments/gemmini/inputs/matmul.mlir create mode 100644 experiments/gemmini/logs/matmul.print-after-all.mlir diff --git a/experiments/gemmini/inputs/matmul.mlir b/experiments/gemmini/inputs/matmul.mlir new file mode 100644 index 0000000..f6fb3b4 --- /dev/null +++ b/experiments/gemmini/inputs/matmul.mlir @@ -0,0 +1,7 @@ +module { + func.func @matmul(%A: memref<64x64xf16>, %B: memref<64x64xf16>, %C: memref<64x64xf32>) { + linalg.matmul ins(%A, %B : memref<64x64xf16>, memref<64x64xf16>) + outs(%C : memref<64x64xf32>) + return + } +} diff --git a/experiments/gemmini/logs/matmul.print-after-all.mlir b/experiments/gemmini/logs/matmul.print-after-all.mlir new file mode 100644 index 0000000..6e9a5cc --- /dev/null +++ b/experiments/gemmini/logs/matmul.print-after-all.mlir @@ -0,0 +1,113 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + func.func @matmul(%arg0: memref<64x64xf16>, %arg1: memref<64x64xf16>, %arg2: memref<64x64xf32>) { + %alloc = memref.alloc() : memref<64x64xi32> + %c0_i32 = arith.constant 0 : i32 + linalg.fill ins(%c0_i32 : i32) outs(%alloc : memref<64x64xi32>) + gemmini.tile_matmul %arg0 %arg1 %arg2 %alloc : memref<64x64xf16> memref<64x64xf16> memref<64x64xf32> memref<64x64xi32> + memref.dealloc %alloc : memref<64x64xi32> + return + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.func @matmul(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: 
i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: !llvm.ptr, %arg15: !llvm.ptr, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64) { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg14, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg15, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg16, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg17, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg19, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg18, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg20, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.insertvalue %arg7, %8[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %10 = llvm.insertvalue %arg8, %9[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %11 = llvm.insertvalue %arg9, %10[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %12 = llvm.insertvalue %arg10, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %13 = llvm.insertvalue %arg12, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %14 = llvm.insertvalue %arg11, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %15 = llvm.insertvalue %arg13, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %16 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %17 = llvm.insertvalue %arg0, %16[0] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %18 = llvm.insertvalue %arg1, %17[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %19 = llvm.insertvalue %arg2, %18[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %20 = llvm.insertvalue %arg3, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %21 = llvm.insertvalue %arg5, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %22 = llvm.insertvalue %arg4, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %23 = llvm.insertvalue %arg6, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %24 = llvm.mlir.constant(64 : index) : i64 + %25 = llvm.mlir.constant(64 : index) : i64 + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.mlir.constant(4096 : index) : i64 + %28 = llvm.mlir.zero : !llvm.ptr + %29 = llvm.getelementptr %28[%27] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %30 = llvm.ptrtoint %29 : !llvm.ptr to i64 + %31 = llvm.call @malloc(%30) : (i64) -> !llvm.ptr + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %33 = llvm.insertvalue %31, %32[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %34 = llvm.insertvalue %31, %33[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = llvm.insertvalue %35, %34[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.insertvalue %24, %36[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %38 = llvm.insertvalue %25, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %39 = llvm.insertvalue %25, %38[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %40 = llvm.insertvalue %26, %39[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %41 = builtin.unrealized_conversion_cast %40 : !llvm.struct<(ptr, 
ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<64x64xi32> + %42 = llvm.mlir.constant(0 : i32) : i32 + linalg.fill ins(%42 : i32) outs(%41 : memref<64x64xi32>) + %43 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %44 = llvm.ptrtoint %43 : !llvm.ptr to i64 + %45 = llvm.extractvalue %15[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 + %47 = llvm.extractvalue %7[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %48 = llvm.ptrtoint %47 : !llvm.ptr to i64 + %49 = llvm.extractvalue %40[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %50 = llvm.ptrtoint %49 : !llvm.ptr to i64 + %51 = llvm.mlir.constant(4575657221408489476 : i64) : i64 + %52 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%51, %52) : (i64, i64) -> () + %53 = llvm.mlir.constant(64 : i64) : i64 + %54 = llvm.mlir.constant(2 : i64) : i64 + %55 = llvm.mlir.constant(4575657221408424000 : i64) : i64 + "gemmini.intr.config_st"(%54, %55) : (i64, i64) -> () + %56 = llvm.mlir.constant(64 : i64) : i64 + %57 = llvm.mlir.constant(4575657221409472769 : i64) : i64 + "gemmini.intr.config_ld"(%57, %56) : (i64, i64) -> () + %58 = llvm.mlir.constant(64 : i64) : i64 + %59 = llvm.mlir.constant(4575657221409472777 : i64) : i64 + "gemmini.intr.config_ld"(%59, %58) : (i64, i64) -> () + %60 = llvm.mlir.constant(256 : i64) : i64 + %61 = llvm.mlir.constant(4575657221409472785 : i64) : i64 + "gemmini.intr.config_ld"(%61, %60) : (i64, i64) -> () + %62 = llvm.mlir.constant(0 : i64) : i64 + %63 = llvm.mlir.constant(0 : i64) : i64 + %64 = llvm.mlir.constant(0 : i64) : i64 + %65 = llvm.mlir.constant(0 : i64) : i64 + %66 = llvm.mlir.constant(0 : i64) : i64 + %67 = llvm.mlir.constant(17180131332 : i64) : i64 + "gemmini.intr.loop_ws_config_bounds"(%66, %67) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_ab"(%44, %46) : (i64, i64) -> () + 
"gemmini.intr.loop_ws_config_addrs_dc"(%50, %48) : (i64, i64) -> () + %68 = llvm.mlir.constant(64 : i64) : i64 + %69 = llvm.mlir.constant(64 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_ab"(%68, %69) : (i64, i64) -> () + %70 = llvm.mlir.constant(64 : i64) : i64 + %71 = llvm.mlir.constant(64 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_dc"(%70, %71) : (i64, i64) -> () + %72 = llvm.mlir.constant(1 : i64) : i64 + %73 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_ws"(%72, %73) : (i64, i64) -> () + %74 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%74, %74) : (i64, i64) -> () + %75 = llvm.extractvalue %40[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%75) : (!llvm.ptr) -> () + llvm.return + } +} + + From 5fef6bad9afabf4cec78f7d388982322c42daefc Mon Sep 17 00:00:00 2001 From: sparsh Date: Tue, 9 Dec 2025 01:38:35 -0800 Subject: [PATCH 02/13] Buddy Gemmini: add batch_matmul + conv IR dumps --- experiments/gemmini/inputs/batch_matmul.mlir | 30 + .../gemmini/inputs/conv_2d_nchw_fchw_f32.mlir | 51 ++ .../batch_matmul.mlir.print-after-all.mlir | 411 ++++++++++++ ...2d_nchw_fchw_f32.mlir.print-after-all.mlir | 594 ++++++++++++++++++ 4 files changed, 1086 insertions(+) create mode 100644 experiments/gemmini/inputs/batch_matmul.mlir create mode 100644 experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir create mode 100644 experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir create mode 100644 experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir diff --git a/experiments/gemmini/inputs/batch_matmul.mlir b/experiments/gemmini/inputs/batch_matmul.mlir new file mode 100644 index 0000000..1cf5347 --- /dev/null +++ b/experiments/gemmini/inputs/batch_matmul.mlir @@ -0,0 +1,30 @@ +// RUN: buddy-opt %s \ +// RUN: --convert-linalg-to-gemmini | \ +// RUN: FileCheck %s + +func.func @main() -> i8 { + %0 = arith.constant 0 : i8 + %1 = arith.constant 1 : i8 + %2 = arith.constant 2 : i8 + 
%input0 = memref.alloc() : memref<3x3x3xi8> + %input1 = memref.alloc() : memref<3x3x3xi8> + %output = memref.alloc() : memref<3x3x3xi8> + linalg.fill + ins(%1 : i8) + outs(%input0 : memref<3x3x3xi8>) + linalg.fill + ins(%2 : i8) + outs(%input1 : memref<3x3x3xi8>) + // CHECK: gemmini.tile_matmul %subview %subview_2 %subview_3 %alloc_4 : + // CHECK-SAME: memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi32> + // CHECK: gemmini.tile_matmul %subview_5 %subview_6 %subview_7 %alloc_8 : + // CHECK-SAME: memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi32> + // CHECK: gemmini.tile_matmul %subview_10 %subview_11 %subview_12 %alloc_13 : + // CHECK-SAME: memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi32> + linalg.batch_matmul + ins(%input0, %input1: memref<3x3x3xi8>, memref<3x3x3xi8>) + outs(%output : memref<3x3x3xi8>) + gemmini.print %output : memref<3x3x3xi8> + memref.dealloc %output : memref<3x3x3xi8> + return %0 : i8 +} diff --git a/experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir b/experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir new file mode 100644 index 0000000..1091167 --- /dev/null +++ b/experiments/gemmini/inputs/conv_2d_nchw_fchw_f32.mlir @@ -0,0 +1,51 @@ +// RUN: buddy-opt %s \ +// RUN: --convert-linalg-to-gemmini="acc_t=f32" | \ +// RUN: FileCheck %s + +memref.global "private" @input : memref<2x2x5x5xf32> = dense<[[[[1., 0., -1., 0., 1.], + [1., 0., -1., 0., 1.], + [1., 0., -1., 0., 1.], + [1., 0., -1., 0., 1.], + [-1., 0., 1., 0., -1.]], + [[-1., 0., 1., 0., -1.], + [-1., 0., 1., 0., -1.], + [-1., 0., 1., 0., -1.], + [-1., 0., 1., 0., -1.], + [-1., 0., 1., 0., -1.]]], + [[[1., 0., 2., 0., 1.], + [1., 0., 2., 0., 1.], + [1., 0., 2., 0., 1.], + [1., 0., 2., 0., 1.], + [-1., 0., 2., 0., -1.]], + [[-1., 0., 2., 
0., -1.], + [-1., 0., 2., 0., -1.], + [-1., 0., 2., 0., -1.], + [-1., 0., 2., 0., -1.], + [-1., 0., 2., 0., -1.]]]]> + +memref.global "private" @weight : memref<2x2x3x3xf32> = dense<[[[[1., 2., 3.], + [3., 2., 1.], + [1., 2., 3.]], + [[3., 2., 1.], + [1., 2., 3.], + [3., 2., 1.]]], + [[[1., 2., 3.], + [3., 2., 1.], + [1., 2., 3.]], + [[3., 2., 1.], + [1., 2., 3.], + [3., 2., 1.]]]]> + +func.func @main() -> i8 { + %0 = arith.constant 0 : i8 + %mem0 = memref.get_global @input : memref<2x2x5x5xf32> + %mem1 = memref.get_global @weight : memref<2x2x3x3xf32> + %mem2 = memref.alloc() : memref<2x2x3x3xf32> + // CHECK: gemmini.tile_conv %alloc_{{[0-9]+}} %alloc_{{[0-9]+}} %alloc_{{[0-9]+}} %alloc_{{[0-9]+}} %{{.+}} %{{.+}} : + // CHECK-SAME: memref<2x5x5x2xf32> memref<18x2xf32> memref<2xf32> memref<18x2xf32> i64 i64 + linalg.conv_2d_nchw_fchw + ins (%mem0, %mem1 : memref<2x2x5x5xf32>, memref<2x2x3x3xf32>) + outs(%mem2 : memref<2x2x3x3xf32>) + gemmini.print %mem2 : memref<2x2x3x3xf32> + return %0 : i8 +} diff --git a/experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir b/experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir new file mode 100644 index 0000000..8f6ed5e --- /dev/null +++ b/experiments/gemmini/logs/batch_matmul.mlir.print-after-all.mlir @@ -0,0 +1,411 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + func.func @main() -> i8 { + %c0_i8 = arith.constant 0 : i8 + %c1_i8 = arith.constant 1 : i8 + %c2_i8 = arith.constant 2 : i8 + %alloc = memref.alloc() : memref<3x3x3xi8> + %alloc_0 = memref.alloc() : memref<3x3x3xi8> + %alloc_1 = memref.alloc() : memref<3x3x3xi8> + linalg.fill ins(%c1_i8 : i8) outs(%alloc : memref<3x3x3xi8>) + linalg.fill ins(%c2_i8 : i8) outs(%alloc_0 : memref<3x3x3xi8>) + %subview = memref.subview %alloc[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>> + %subview_2 = memref.subview %alloc_0[0, 0, 0] [1, 3, 3] [1, 1, 
1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>> + %subview_3 = memref.subview %alloc_1[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>> + %alloc_4 = memref.alloc() : memref<3x3xi32> + %c0_i32 = arith.constant 0 : i32 + linalg.fill ins(%c0_i32 : i32) outs(%alloc_4 : memref<3x3xi32>) + gemmini.tile_matmul %subview %subview_2 %subview_3 %alloc_4 : memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi8, strided<[3, 1]>> memref<3x3xi32> + memref.dealloc %alloc_4 : memref<3x3xi32> + %subview_5 = memref.subview %alloc[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>> + %subview_6 = memref.subview %alloc_0[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>> + %subview_7 = memref.subview %alloc_1[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>> + %alloc_8 = memref.alloc() : memref<3x3xi32> + %c0_i32_9 = arith.constant 0 : i32 + linalg.fill ins(%c0_i32_9 : i32) outs(%alloc_8 : memref<3x3xi32>) + gemmini.tile_matmul %subview_5 %subview_6 %subview_7 %alloc_8 : memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi8, strided<[3, 1], offset: 9>> memref<3x3xi32> + memref.dealloc %alloc_8 : memref<3x3xi32> + %subview_10 = memref.subview %alloc[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>> + %subview_11 = memref.subview %alloc_0[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>> + %subview_12 = memref.subview %alloc_1[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>> + %alloc_13 = memref.alloc() : memref<3x3xi32> + %c0_i32_14 = arith.constant 0 : i32 + linalg.fill ins(%c0_i32_14 : i32) outs(%alloc_13 : memref<3x3xi32>) + gemmini.tile_matmul %subview_10 %subview_11 %subview_12 %alloc_13 : memref<3x3xi8, 
strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi8, strided<[3, 1], offset: 18>> memref<3x3xi32> + memref.dealloc %alloc_13 : memref<3x3xi32> + gemmini.print %alloc_1 : memref<3x3x3xi8> + memref.dealloc %alloc_1 : memref<3x3x3xi8> + return %c0_i8 : i8 + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.mlir.global internal constant @nl("\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @frmt_spec("%d \00") {addr_space = 0 : i32} + llvm.func @printf(!llvm.ptr, ...) -> i32 + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.func @main() -> i8 { + %0 = llvm.mlir.constant(0 : i8) : i8 + %1 = llvm.mlir.constant(1 : i8) : i8 + %2 = llvm.mlir.constant(2 : i8) : i8 + %3 = llvm.mlir.constant(3 : index) : i64 + %4 = llvm.mlir.constant(3 : index) : i64 + %5 = llvm.mlir.constant(3 : index) : i64 + %6 = llvm.mlir.constant(1 : index) : i64 + %7 = llvm.mlir.constant(9 : index) : i64 + %8 = llvm.mlir.constant(27 : index) : i64 + %9 = llvm.mlir.zero : !llvm.ptr + %10 = llvm.getelementptr %9[%8] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %11 = llvm.ptrtoint %10 : !llvm.ptr to i64 + %12 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr + %13 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %14 = llvm.insertvalue %12, %13[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %15 = llvm.insertvalue %12, %14[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %16 = llvm.mlir.constant(0 : index) : i64 + %17 = llvm.insertvalue %16, %15[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %18 = llvm.insertvalue %3, %17[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %19 = llvm.insertvalue %4, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %20 = llvm.insertvalue %5, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x 
i64>, array<3 x i64>)> + %21 = llvm.insertvalue %7, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %22 = llvm.insertvalue %5, %21[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %23 = llvm.insertvalue %6, %22[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %24 = builtin.unrealized_conversion_cast %23 : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> to memref<3x3x3xi8> + %25 = llvm.mlir.constant(3 : index) : i64 + %26 = llvm.mlir.constant(3 : index) : i64 + %27 = llvm.mlir.constant(3 : index) : i64 + %28 = llvm.mlir.constant(1 : index) : i64 + %29 = llvm.mlir.constant(9 : index) : i64 + %30 = llvm.mlir.constant(27 : index) : i64 + %31 = llvm.mlir.zero : !llvm.ptr + %32 = llvm.getelementptr %31[%30] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %33 = llvm.ptrtoint %32 : !llvm.ptr to i64 + %34 = llvm.call @malloc(%33) : (i64) -> !llvm.ptr + %35 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %36 = llvm.insertvalue %34, %35[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %37 = llvm.insertvalue %34, %36[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %38 = llvm.mlir.constant(0 : index) : i64 + %39 = llvm.insertvalue %38, %37[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %40 = llvm.insertvalue %25, %39[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %41 = llvm.insertvalue %26, %40[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %42 = llvm.insertvalue %27, %41[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %43 = llvm.insertvalue %29, %42[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %44 = llvm.insertvalue %27, %43[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %45 = llvm.insertvalue %28, %44[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %46 = 
builtin.unrealized_conversion_cast %45 : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> to memref<3x3x3xi8> + %47 = llvm.mlir.constant(3 : index) : i64 + %48 = llvm.mlir.constant(3 : index) : i64 + %49 = llvm.mlir.constant(3 : index) : i64 + %50 = llvm.mlir.constant(1 : index) : i64 + %51 = llvm.mlir.constant(9 : index) : i64 + %52 = llvm.mlir.constant(27 : index) : i64 + %53 = llvm.mlir.zero : !llvm.ptr + %54 = llvm.getelementptr %53[%52] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %55 = llvm.ptrtoint %54 : !llvm.ptr to i64 + %56 = llvm.call @malloc(%55) : (i64) -> !llvm.ptr + %57 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %58 = llvm.insertvalue %56, %57[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %59 = llvm.insertvalue %56, %58[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %60 = llvm.mlir.constant(0 : index) : i64 + %61 = llvm.insertvalue %60, %59[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %62 = llvm.insertvalue %47, %61[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %63 = llvm.insertvalue %48, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %64 = llvm.insertvalue %49, %63[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %65 = llvm.insertvalue %51, %64[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %66 = llvm.insertvalue %49, %65[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %67 = llvm.insertvalue %50, %66[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %68 = builtin.unrealized_conversion_cast %67 : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> to memref<3x3x3xi8> + linalg.fill ins(%1 : i8) outs(%24 : memref<3x3x3xi8>) + linalg.fill ins(%2 : i8) outs(%46 : memref<3x3x3xi8>) + %subview = memref.subview %24[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, 
strided<[3, 1]>> + %69 = builtin.unrealized_conversion_cast %subview : memref<3x3xi8, strided<[3, 1]>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %subview_0 = memref.subview %46[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>> + %70 = builtin.unrealized_conversion_cast %subview_0 : memref<3x3xi8, strided<[3, 1]>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %subview_1 = memref.subview %68[0, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1]>> + %71 = builtin.unrealized_conversion_cast %subview_1 : memref<3x3xi8, strided<[3, 1]>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %72 = llvm.mlir.constant(3 : index) : i64 + %73 = llvm.mlir.constant(3 : index) : i64 + %74 = llvm.mlir.constant(1 : index) : i64 + %75 = llvm.mlir.constant(9 : index) : i64 + %76 = llvm.mlir.zero : !llvm.ptr + %77 = llvm.getelementptr %76[%75] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %78 = llvm.ptrtoint %77 : !llvm.ptr to i64 + %79 = llvm.call @malloc(%78) : (i64) -> !llvm.ptr + %80 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %81 = llvm.insertvalue %79, %80[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %82 = llvm.insertvalue %79, %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %83 = llvm.mlir.constant(0 : index) : i64 + %84 = llvm.insertvalue %83, %82[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %85 = llvm.insertvalue %72, %84[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %86 = llvm.insertvalue %73, %85[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %87 = llvm.insertvalue %73, %86[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %88 = llvm.insertvalue %74, %87[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %89 = builtin.unrealized_conversion_cast %88 : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x3xi32> + %90 = llvm.mlir.constant(0 : i32) : i32 + linalg.fill ins(%90 : i32) outs(%89 : memref<3x3xi32>) + %91 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %92 = llvm.ptrtoint %91 : !llvm.ptr to i64 + %93 = llvm.mlir.constant(0 : index) : i64 + %94 = llvm.extractvalue %70[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %95 = llvm.ptrtoint %94 : !llvm.ptr to i64 + %96 = llvm.mlir.constant(0 : index) : i64 + %97 = llvm.extractvalue %71[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %98 = llvm.ptrtoint %97 : !llvm.ptr to i64 + %99 = llvm.mlir.constant(0 : index) : i64 + %100 = llvm.extractvalue %88[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %101 = llvm.ptrtoint %100 : !llvm.ptr to i64 + %102 = llvm.mlir.constant(4575657221408489476 : i64) : i64 + %103 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%102, %103) : (i64, i64) -> () + %104 = llvm.mlir.constant(3 : i64) : i64 + %105 = llvm.mlir.constant(2 : i64) : i64 + %106 = llvm.mlir.constant(4575657221408423939 : i64) : i64 + "gemmini.intr.config_st"(%105, %106) : (i64, i64) -> () + %107 = llvm.mlir.constant(3 : i64) : i64 + %108 = llvm.mlir.constant(4575657221409472769 : i64) : i64 + "gemmini.intr.config_ld"(%108, %107) : (i64, i64) -> () + %109 = llvm.mlir.constant(3 : i64) : i64 + %110 = llvm.mlir.constant(4575657221409472777 : i64) : i64 + "gemmini.intr.config_ld"(%110, %109) : (i64, i64) -> () + %111 = llvm.mlir.constant(12 : i64) : i64 + %112 = llvm.mlir.constant(4575657221409472785 : i64) : i64 + "gemmini.intr.config_ld"(%112, %111) : (i64, i64) -> () + %113 = llvm.mlir.constant(0 : i64) : i64 + %114 = llvm.mlir.constant(0 : i64) : i64 + %115 = llvm.mlir.constant(0 : i64) : i64 + %116 = llvm.mlir.constant(0 : i64) : i64 + %117 = llvm.mlir.constant(55835426829 : i64) : i64 + %118 = 
llvm.mlir.constant(4295032833 : i64) : i64 + "gemmini.intr.loop_ws_config_bounds"(%117, %118) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_ab"(%92, %95) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_dc"(%101, %98) : (i64, i64) -> () + %119 = llvm.mlir.constant(3 : i64) : i64 + %120 = llvm.mlir.constant(3 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_ab"(%119, %120) : (i64, i64) -> () + %121 = llvm.mlir.constant(3 : i64) : i64 + %122 = llvm.mlir.constant(3 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_dc"(%121, %122) : (i64, i64) -> () + %123 = llvm.mlir.constant(1 : i64) : i64 + %124 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_ws"(%123, %124) : (i64, i64) -> () + %125 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%125, %125) : (i64, i64) -> () + %126 = llvm.extractvalue %88[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%126) : (!llvm.ptr) -> () + %subview_2 = memref.subview %24[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>> + %127 = builtin.unrealized_conversion_cast %subview_2 : memref<3x3xi8, strided<[3, 1], offset: 9>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %subview_3 = memref.subview %46[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>> + %128 = builtin.unrealized_conversion_cast %subview_3 : memref<3x3xi8, strided<[3, 1], offset: 9>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %subview_4 = memref.subview %68[1, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 9>> + %129 = builtin.unrealized_conversion_cast %subview_4 : memref<3x3xi8, strided<[3, 1], offset: 9>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %130 = llvm.mlir.constant(3 : index) : i64 + %131 = llvm.mlir.constant(3 : index) : i64 + %132 = llvm.mlir.constant(1 : index) : i64 + %133 = llvm.mlir.constant(9 
: index) : i64 + %134 = llvm.mlir.zero : !llvm.ptr + %135 = llvm.getelementptr %134[%133] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %136 = llvm.ptrtoint %135 : !llvm.ptr to i64 + %137 = llvm.call @malloc(%136) : (i64) -> !llvm.ptr + %138 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %139 = llvm.insertvalue %137, %138[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %140 = llvm.insertvalue %137, %139[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %141 = llvm.mlir.constant(0 : index) : i64 + %142 = llvm.insertvalue %141, %140[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %143 = llvm.insertvalue %130, %142[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %144 = llvm.insertvalue %131, %143[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %145 = llvm.insertvalue %131, %144[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %146 = llvm.insertvalue %132, %145[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %147 = builtin.unrealized_conversion_cast %146 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x3xi32> + %148 = llvm.mlir.constant(0 : i32) : i32 + linalg.fill ins(%148 : i32) outs(%147 : memref<3x3xi32>) + %149 = llvm.extractvalue %127[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %150 = llvm.ptrtoint %149 : !llvm.ptr to i64 + %151 = llvm.mlir.constant(9 : index) : i64 + %152 = llvm.add %150, %151 : i64 + %153 = llvm.extractvalue %128[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %154 = llvm.ptrtoint %153 : !llvm.ptr to i64 + %155 = llvm.mlir.constant(9 : index) : i64 + %156 = llvm.add %154, %155 : i64 + %157 = llvm.extractvalue %129[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %158 = llvm.ptrtoint %157 : !llvm.ptr to i64 + %159 = llvm.mlir.constant(9 : index) : i64 + %160 = llvm.add 
%158, %159 : i64 + %161 = llvm.extractvalue %146[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %162 = llvm.ptrtoint %161 : !llvm.ptr to i64 + %163 = llvm.mlir.constant(4575657221408489476 : i64) : i64 + %164 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%163, %164) : (i64, i64) -> () + %165 = llvm.mlir.constant(3 : i64) : i64 + %166 = llvm.mlir.constant(2 : i64) : i64 + %167 = llvm.mlir.constant(4575657221408423939 : i64) : i64 + "gemmini.intr.config_st"(%166, %167) : (i64, i64) -> () + %168 = llvm.mlir.constant(3 : i64) : i64 + %169 = llvm.mlir.constant(4575657221409472769 : i64) : i64 + "gemmini.intr.config_ld"(%169, %168) : (i64, i64) -> () + %170 = llvm.mlir.constant(3 : i64) : i64 + %171 = llvm.mlir.constant(4575657221409472777 : i64) : i64 + "gemmini.intr.config_ld"(%171, %170) : (i64, i64) -> () + %172 = llvm.mlir.constant(12 : i64) : i64 + %173 = llvm.mlir.constant(4575657221409472785 : i64) : i64 + "gemmini.intr.config_ld"(%173, %172) : (i64, i64) -> () + %174 = llvm.mlir.constant(0 : i64) : i64 + %175 = llvm.mlir.constant(0 : i64) : i64 + %176 = llvm.mlir.constant(0 : i64) : i64 + %177 = llvm.mlir.constant(0 : i64) : i64 + %178 = llvm.mlir.constant(55835426829 : i64) : i64 + %179 = llvm.mlir.constant(4295032833 : i64) : i64 + "gemmini.intr.loop_ws_config_bounds"(%178, %179) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_ab"(%152, %156) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_dc"(%162, %160) : (i64, i64) -> () + %180 = llvm.mlir.constant(3 : i64) : i64 + %181 = llvm.mlir.constant(3 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_ab"(%180, %181) : (i64, i64) -> () + %182 = llvm.mlir.constant(3 : i64) : i64 + %183 = llvm.mlir.constant(3 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_dc"(%182, %183) : (i64, i64) -> () + %184 = llvm.mlir.constant(1 : i64) : i64 + %185 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_ws"(%184, %185) : (i64, i64) -> () + %186 = 
llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%186, %186) : (i64, i64) -> () + %187 = llvm.extractvalue %146[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%187) : (!llvm.ptr) -> () + %subview_5 = memref.subview %24[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>> + %188 = builtin.unrealized_conversion_cast %subview_5 : memref<3x3xi8, strided<[3, 1], offset: 18>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %subview_6 = memref.subview %46[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>> + %189 = builtin.unrealized_conversion_cast %subview_6 : memref<3x3xi8, strided<[3, 1], offset: 18>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %subview_7 = memref.subview %68[2, 0, 0] [1, 3, 3] [1, 1, 1] : memref<3x3x3xi8> to memref<3x3xi8, strided<[3, 1], offset: 18>> + %190 = builtin.unrealized_conversion_cast %subview_7 : memref<3x3xi8, strided<[3, 1], offset: 18>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %191 = llvm.mlir.constant(3 : index) : i64 + %192 = llvm.mlir.constant(3 : index) : i64 + %193 = llvm.mlir.constant(1 : index) : i64 + %194 = llvm.mlir.constant(9 : index) : i64 + %195 = llvm.mlir.zero : !llvm.ptr + %196 = llvm.getelementptr %195[%194] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %197 = llvm.ptrtoint %196 : !llvm.ptr to i64 + %198 = llvm.call @malloc(%197) : (i64) -> !llvm.ptr + %199 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %200 = llvm.insertvalue %198, %199[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %201 = llvm.insertvalue %198, %200[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %202 = llvm.mlir.constant(0 : index) : i64 + %203 = llvm.insertvalue %202, %201[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %204 = llvm.insertvalue %191, %203[3, 0] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %205 = llvm.insertvalue %192, %204[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %206 = llvm.insertvalue %192, %205[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %207 = llvm.insertvalue %193, %206[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %208 = builtin.unrealized_conversion_cast %207 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x3xi32> + %209 = llvm.mlir.constant(0 : i32) : i32 + linalg.fill ins(%209 : i32) outs(%208 : memref<3x3xi32>) + %210 = llvm.extractvalue %188[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %211 = llvm.ptrtoint %210 : !llvm.ptr to i64 + %212 = llvm.mlir.constant(18 : index) : i64 + %213 = llvm.add %211, %212 : i64 + %214 = llvm.extractvalue %189[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %215 = llvm.ptrtoint %214 : !llvm.ptr to i64 + %216 = llvm.mlir.constant(18 : index) : i64 + %217 = llvm.add %215, %216 : i64 + %218 = llvm.extractvalue %190[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %219 = llvm.ptrtoint %218 : !llvm.ptr to i64 + %220 = llvm.mlir.constant(18 : index) : i64 + %221 = llvm.add %219, %220 : i64 + %222 = llvm.extractvalue %207[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %223 = llvm.ptrtoint %222 : !llvm.ptr to i64 + %224 = llvm.mlir.constant(4575657221408489476 : i64) : i64 + %225 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%224, %225) : (i64, i64) -> () + %226 = llvm.mlir.constant(3 : i64) : i64 + %227 = llvm.mlir.constant(2 : i64) : i64 + %228 = llvm.mlir.constant(4575657221408423939 : i64) : i64 + "gemmini.intr.config_st"(%227, %228) : (i64, i64) -> () + %229 = llvm.mlir.constant(3 : i64) : i64 + %230 = llvm.mlir.constant(4575657221409472769 : i64) : i64 + "gemmini.intr.config_ld"(%230, %229) : (i64, i64) -> () + %231 = 
llvm.mlir.constant(3 : i64) : i64 + %232 = llvm.mlir.constant(4575657221409472777 : i64) : i64 + "gemmini.intr.config_ld"(%232, %231) : (i64, i64) -> () + %233 = llvm.mlir.constant(12 : i64) : i64 + %234 = llvm.mlir.constant(4575657221409472785 : i64) : i64 + "gemmini.intr.config_ld"(%234, %233) : (i64, i64) -> () + %235 = llvm.mlir.constant(0 : i64) : i64 + %236 = llvm.mlir.constant(0 : i64) : i64 + %237 = llvm.mlir.constant(0 : i64) : i64 + %238 = llvm.mlir.constant(0 : i64) : i64 + %239 = llvm.mlir.constant(55835426829 : i64) : i64 + %240 = llvm.mlir.constant(4295032833 : i64) : i64 + "gemmini.intr.loop_ws_config_bounds"(%239, %240) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_ab"(%213, %217) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_dc"(%223, %221) : (i64, i64) -> () + %241 = llvm.mlir.constant(3 : i64) : i64 + %242 = llvm.mlir.constant(3 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_ab"(%241, %242) : (i64, i64) -> () + %243 = llvm.mlir.constant(3 : i64) : i64 + %244 = llvm.mlir.constant(3 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_dc"(%243, %244) : (i64, i64) -> () + %245 = llvm.mlir.constant(1 : i64) : i64 + %246 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_ws"(%245, %246) : (i64, i64) -> () + %247 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%247, %247) : (i64, i64) -> () + %248 = llvm.extractvalue %207[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%248) : (!llvm.ptr) -> () + %249 = llvm.mlir.addressof @frmt_spec : !llvm.ptr + %250 = llvm.mlir.constant(0 : index) : i64 + %251 = llvm.getelementptr %249[%250, %250] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8> + %252 = llvm.mlir.addressof @nl : !llvm.ptr + %253 = llvm.mlir.constant(0 : index) : i64 + %254 = llvm.getelementptr %252[%253, %253] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8> + %255 = llvm.mlir.constant(0 : index) : i64 + %256 = llvm.mlir.constant(3 : index) : i64 + %257 
= llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%255 : i64) + ^bb1(%258: i64): // 2 preds: ^bb0, ^bb8 + %259 = llvm.icmp "slt" %258, %256 : i64 + llvm.cond_br %259, ^bb2, ^bb9 + ^bb2: // pred: ^bb1 + %260 = llvm.mlir.constant(0 : index) : i64 + %261 = llvm.mlir.constant(3 : index) : i64 + %262 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb3(%260 : i64) + ^bb3(%263: i64): // 2 preds: ^bb2, ^bb7 + %264 = llvm.icmp "slt" %263, %261 : i64 + llvm.cond_br %264, ^bb4, ^bb8 + ^bb4: // pred: ^bb3 + %265 = llvm.mlir.constant(0 : index) : i64 + %266 = llvm.mlir.constant(3 : index) : i64 + %267 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb5(%265 : i64) + ^bb5(%268: i64): // 2 preds: ^bb4, ^bb6 + %269 = llvm.icmp "slt" %268, %266 : i64 + llvm.cond_br %269, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + %270 = llvm.extractvalue %67[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %271 = llvm.mlir.constant(9 : index) : i64 + %272 = llvm.mul %258, %271 : i64 + %273 = llvm.mlir.constant(3 : index) : i64 + %274 = llvm.mul %263, %273 : i64 + %275 = llvm.add %272, %274 : i64 + %276 = llvm.add %275, %268 : i64 + %277 = llvm.getelementptr %270[%276] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %278 = llvm.load %277 : !llvm.ptr -> i8 + %279 = llvm.sext %278 : i8 to i32 + %280 = llvm.call @printf(%251, %279) vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + %281 = llvm.add %268, %267 : i64 + llvm.br ^bb5(%281 : i64) + ^bb7: // pred: ^bb5 + %282 = llvm.call @printf(%254) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %283 = llvm.add %263, %262 : i64 + llvm.br ^bb3(%283 : i64) + ^bb8: // pred: ^bb3 + %284 = llvm.call @printf(%254) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %285 = llvm.add %258, %257 : i64 + llvm.br ^bb1(%285 : i64) + ^bb9: // pred: ^bb1 + %286 = llvm.extractvalue %67[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + llvm.call @free(%286) : (!llvm.ptr) -> () + llvm.return %0 : i8 + } +} + + diff --git 
a/experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir b/experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir new file mode 100644 index 0000000..c372bc5 --- /dev/null +++ b/experiments/gemmini/logs/conv_2d_nchw_fchw_f32.mlir.print-after-all.mlir @@ -0,0 +1,594 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + memref.global "private" @input : memref<2x2x5x5xf32> = dense<[[[[1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]]], [[[1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]]]]> + memref.global "private" 
@weight : memref<2x2x3x3xf32> = dense<[[[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]], [[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]]]> + func.func @main() -> i8 { + %c0_i8 = arith.constant 0 : i8 + %0 = memref.get_global @input : memref<2x2x5x5xf32> + %1 = memref.get_global @weight : memref<2x2x3x3xf32> + %alloc = memref.alloc() : memref<2x2x3x3xf32> + %alloc_0 = memref.alloc() : memref<2x5x5x2xf32> + %alloc_1 = memref.alloc() : memref<18x2xf32> + %alloc_2 = memref.alloc() : memref<2xi32> + %alloc_3 = memref.alloc() : memref<18x2xf32> + %c3_i64 = arith.constant 3 : i64 + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c2_4 = arith.constant 2 : index + %c1 = arith.constant 1 : index + scf.for %arg0 = %c0 to %c2_4 step %c1 { + %c0_12 = arith.constant 0 : index + %c2_13 = arith.constant 2 : index + %c1_14 = arith.constant 1 : index + scf.for %arg1 = %c0_12 to %c2_13 step %c1_14 { + %c0_15 = arith.constant 0 : index + %c5 = arith.constant 5 : index + %c1_16 = arith.constant 1 : index + scf.for %arg2 = %c0_15 to %c5 step %c1_16 { + %c0_17 = arith.constant 0 : index + %c5_18 = arith.constant 5 : index + %c1_19 = arith.constant 1 : index + scf.for %arg3 = %c0_17 to %c5_18 step %c1_19 { + %2 = memref.load %0[%arg0, %arg1, %arg2, %arg3] : memref<2x2x5x5xf32> + memref.store %2, %alloc_0[%arg0, %arg2, %arg3, %arg1] : memref<2x5x5x2xf32> + } + } + } + } + %c0_5 = arith.constant 0 : index + %c2_6 = arith.constant 2 : index + %c1_7 = arith.constant 1 : index + scf.for %arg0 = %c0_5 to %c2_6 
step %c1_7 { + %c0_12 = arith.constant 0 : index + %c2_13 = arith.constant 2 : index + %c1_14 = arith.constant 1 : index + scf.for %arg1 = %c0_12 to %c2_13 step %c1_14 { + %c0_15 = arith.constant 0 : index + %c3_16 = arith.constant 3 : index + %c1_17 = arith.constant 1 : index + scf.for %arg2 = %c0_15 to %c3_16 step %c1_17 { + %c0_18 = arith.constant 0 : index + %c3_19 = arith.constant 3 : index + %c1_20 = arith.constant 1 : index + scf.for %arg3 = %c0_18 to %c3_19 step %c1_20 { + %2 = arith.muli %arg2, %c3 : index + %3 = arith.muli %2, %c2 : index + %4 = arith.muli %arg3, %c2 : index + %5 = arith.addi %3, %4 : index + %6 = arith.addi %5, %arg1 : index + %7 = memref.load %1[%arg0, %arg1, %arg2, %arg3] : memref<2x2x3x3xf32> + memref.store %7, %alloc_1[%6, %arg0] : memref<18x2xf32> + } + } + } + } + %c3_i64_8 = arith.constant 3 : i64 + gemmini.tile_conv %alloc_0 %alloc_1 %alloc_2 %alloc_3 %c3_i64 %c3_i64 %c3_i64_8 : memref<2x5x5x2xf32> memref<18x2xf32> memref<2xi32> memref<18x2xf32> i64 i64 i64 + %c0_9 = arith.constant 0 : index + %c2_10 = arith.constant 2 : index + %c1_11 = arith.constant 1 : index + scf.for %arg0 = %c0_9 to %c2_10 step %c1_11 { + %c0_12 = arith.constant 0 : index + %c2_13 = arith.constant 2 : index + %c1_14 = arith.constant 1 : index + scf.for %arg1 = %c0_12 to %c2_13 step %c1_14 { + %c0_15 = arith.constant 0 : index + %c3_16 = arith.constant 3 : index + %c1_17 = arith.constant 1 : index + scf.for %arg2 = %c0_15 to %c3_16 step %c1_17 { + %c0_18 = arith.constant 0 : index + %c3_19 = arith.constant 3 : index + %c1_20 = arith.constant 1 : index + scf.for %arg3 = %c0_18 to %c3_19 step %c1_20 { + %c3_21 = arith.constant 3 : index + %2 = arith.muli %arg0, %c3_21 : index + %3 = arith.muli %2, %c3_21 : index + %4 = arith.muli %arg2, %c3_21 : index + %5 = arith.addi %3, %4 : index + %6 = arith.addi %5, %arg3 : index + %7 = memref.load %alloc_3[%6, %arg1] : memref<18x2xf32> + memref.store %7, %alloc[%arg0, %arg1, %arg2, %arg3] : memref<2x2x3x3xf32> + } + } + 
} + } + memref.dealloc %alloc_0 : memref<2x5x5x2xf32> + memref.dealloc %alloc_1 : memref<18x2xf32> + memref.dealloc %alloc_3 : memref<18x2xf32> + memref.dealloc %alloc_2 : memref<2xi32> + gemmini.print %alloc : memref<2x2x3x3xf32> + return %c0_i8 : i8 + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.mlir.global internal constant @nl("\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @frmt_spec("%f \00") {addr_space = 0 : i32} + llvm.func @printf(!llvm.ptr, ...) -> i32 + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.mlir.global private @input(dense<[[[[1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, -1.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 1.000000e+00, 0.000000e+00, -1.000000e+00]]], [[[1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]], [[-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, 
-1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00], [-1.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00, -1.000000e+00]]]]> : tensor<2x2x5x5xf32>) {addr_space = 0 : i32} : !llvm.array<2 x array<2 x array<5 x array<5 x f32>>>> + llvm.mlir.global private @weight(dense<[[[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]], [[[1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00]], [[3.000000e+00, 2.000000e+00, 1.000000e+00], [1.000000e+00, 2.000000e+00, 3.000000e+00], [3.000000e+00, 2.000000e+00, 1.000000e+00]]]]> : tensor<2x2x3x3xf32>) {addr_space = 0 : i32} : !llvm.array<2 x array<2 x array<3 x array<3 x f32>>>> + llvm.func @main() -> i8 { + %0 = llvm.mlir.constant(0 : i8) : i8 + %1 = llvm.mlir.constant(2 : index) : i64 + %2 = llvm.mlir.constant(2 : index) : i64 + %3 = llvm.mlir.constant(5 : index) : i64 + %4 = llvm.mlir.constant(5 : index) : i64 + %5 = llvm.mlir.constant(1 : index) : i64 + %6 = llvm.mlir.constant(25 : index) : i64 + %7 = llvm.mlir.constant(50 : index) : i64 + %8 = llvm.mlir.constant(100 : index) : i64 + %9 = llvm.mlir.zero : !llvm.ptr + %10 = llvm.getelementptr %9[%8] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %11 = llvm.ptrtoint %10 : !llvm.ptr to i64 + %12 = llvm.mlir.addressof @input : !llvm.ptr + %13 = llvm.getelementptr %12[0, 0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<2 x array<5 x array<5 x f32>>>> + %14 = llvm.mlir.constant(3735928559 : index) : i64 + %15 = llvm.inttoptr %14 : i64 to !llvm.ptr + %16 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.insertvalue %15, %16[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = 
llvm.insertvalue %13, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = llvm.insertvalue %19, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %1, %20[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %2, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %3, %22[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.insertvalue %4, %23[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.insertvalue %7, %24[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %26 = llvm.insertvalue %6, %25[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %27 = llvm.insertvalue %4, %26[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %28 = llvm.insertvalue %5, %27[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %29 = llvm.mlir.constant(2 : index) : i64 + %30 = llvm.mlir.constant(2 : index) : i64 + %31 = llvm.mlir.constant(3 : index) : i64 + %32 = llvm.mlir.constant(3 : index) : i64 + %33 = llvm.mlir.constant(1 : index) : i64 + %34 = llvm.mlir.constant(9 : index) : i64 + %35 = llvm.mlir.constant(18 : index) : i64 + %36 = llvm.mlir.constant(36 : index) : i64 + %37 = llvm.mlir.zero : !llvm.ptr + %38 = llvm.getelementptr %37[%36] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %39 = llvm.ptrtoint %38 : !llvm.ptr to i64 + %40 = llvm.mlir.addressof @weight : !llvm.ptr + %41 = llvm.getelementptr %40[0, 0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<2 x array<3 x array<3 x f32>>>> + %42 = llvm.mlir.constant(3735928559 : index) : i64 + %43 = llvm.inttoptr %42 : i64 to !llvm.ptr + %44 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %43, %44[0] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %41, %45[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(0 : index) : i64 + %48 = llvm.insertvalue %47, %46[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %29, %48[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %30, %49[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %31, %50[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %32, %51[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %35, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %32, %54[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %33, %55[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.mlir.constant(2 : index) : i64 + %58 = llvm.mlir.constant(2 : index) : i64 + %59 = llvm.mlir.constant(3 : index) : i64 + %60 = llvm.mlir.constant(3 : index) : i64 + %61 = llvm.mlir.constant(1 : index) : i64 + %62 = llvm.mlir.constant(9 : index) : i64 + %63 = llvm.mlir.constant(18 : index) : i64 + %64 = llvm.mlir.constant(36 : index) : i64 + %65 = llvm.mlir.zero : !llvm.ptr + %66 = llvm.getelementptr %65[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %67 = llvm.ptrtoint %66 : !llvm.ptr to i64 + %68 = llvm.call @malloc(%67) : (i64) -> !llvm.ptr + %69 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %68, %69[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %68, %70[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.mlir.constant(0 : index) : 
i64 + %73 = llvm.insertvalue %72, %71[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %57, %73[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %58, %74[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %59, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.insertvalue %60, %76[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %63, %77[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %62, %78[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %60, %79[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %61, %80[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.mlir.constant(2 : index) : i64 + %83 = llvm.mlir.constant(5 : index) : i64 + %84 = llvm.mlir.constant(5 : index) : i64 + %85 = llvm.mlir.constant(2 : index) : i64 + %86 = llvm.mlir.constant(1 : index) : i64 + %87 = llvm.mlir.constant(10 : index) : i64 + %88 = llvm.mlir.constant(50 : index) : i64 + %89 = llvm.mlir.constant(100 : index) : i64 + %90 = llvm.mlir.zero : !llvm.ptr + %91 = llvm.getelementptr %90[%89] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %92 = llvm.ptrtoint %91 : !llvm.ptr to i64 + %93 = llvm.call @malloc(%92) : (i64) -> !llvm.ptr + %94 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.insertvalue %93, %94[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %96 = llvm.insertvalue %93, %95[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %97 = llvm.mlir.constant(0 : index) : i64 + %98 = llvm.insertvalue %97, %96[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %99 = llvm.insertvalue %82, %98[3, 0] : !llvm.struct<(ptr, ptr, 
i64, array<4 x i64>, array<4 x i64>)> + %100 = llvm.insertvalue %83, %99[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %101 = llvm.insertvalue %84, %100[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %102 = llvm.insertvalue %85, %101[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %103 = llvm.insertvalue %88, %102[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %104 = llvm.insertvalue %87, %103[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %105 = llvm.insertvalue %85, %104[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %106 = llvm.insertvalue %86, %105[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %107 = llvm.mlir.constant(18 : index) : i64 + %108 = llvm.mlir.constant(2 : index) : i64 + %109 = llvm.mlir.constant(1 : index) : i64 + %110 = llvm.mlir.constant(36 : index) : i64 + %111 = llvm.mlir.zero : !llvm.ptr + %112 = llvm.getelementptr %111[%110] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %113 = llvm.ptrtoint %112 : !llvm.ptr to i64 + %114 = llvm.call @malloc(%113) : (i64) -> !llvm.ptr + %115 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %116 = llvm.insertvalue %114, %115[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %117 = llvm.insertvalue %114, %116[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %118 = llvm.mlir.constant(0 : index) : i64 + %119 = llvm.insertvalue %118, %117[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %120 = llvm.insertvalue %107, %119[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %121 = llvm.insertvalue %108, %120[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %122 = llvm.insertvalue %108, %121[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %123 = llvm.insertvalue %109, %122[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %124 = llvm.mlir.constant(2 : index) : i64 + %125 = llvm.mlir.constant(1 : index) : i64 + %126 = llvm.mlir.zero : !llvm.ptr + %127 = llvm.getelementptr %126[%124] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %128 = llvm.ptrtoint %127 : !llvm.ptr to i64 + %129 = llvm.call @malloc(%128) : (i64) -> !llvm.ptr + %130 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %131 = llvm.insertvalue %129, %130[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %132 = llvm.insertvalue %129, %131[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %133 = llvm.mlir.constant(0 : index) : i64 + %134 = llvm.insertvalue %133, %132[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %135 = llvm.insertvalue %124, %134[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %136 = llvm.insertvalue %125, %135[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %137 = llvm.mlir.constant(18 : index) : i64 + %138 = llvm.mlir.constant(2 : index) : i64 + %139 = llvm.mlir.constant(1 : index) : i64 + %140 = llvm.mlir.constant(36 : index) : i64 + %141 = llvm.mlir.zero : !llvm.ptr + %142 = llvm.getelementptr %141[%140] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %143 = llvm.ptrtoint %142 : !llvm.ptr to i64 + %144 = llvm.call @malloc(%143) : (i64) -> !llvm.ptr + %145 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %146 = llvm.insertvalue %144, %145[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %147 = llvm.insertvalue %144, %146[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %148 = llvm.mlir.constant(0 : index) : i64 + %149 = llvm.insertvalue %148, %147[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %150 = llvm.insertvalue %137, %149[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %151 = llvm.insertvalue 
%138, %150[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %152 = llvm.insertvalue %138, %151[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %153 = llvm.insertvalue %139, %152[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %154 = llvm.mlir.constant(3 : i64) : i64 + %155 = llvm.mlir.constant(3 : index) : i64 + %156 = llvm.mlir.constant(2 : index) : i64 + %157 = llvm.mlir.constant(0 : index) : i64 + %158 = llvm.mlir.constant(2 : index) : i64 + %159 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%157 : i64) + ^bb1(%160: i64): // 2 preds: ^bb0, ^bb11 + %161 = llvm.icmp "slt" %160, %158 : i64 + llvm.cond_br %161, ^bb2, ^bb12 + ^bb2: // pred: ^bb1 + %162 = llvm.mlir.constant(0 : index) : i64 + %163 = llvm.mlir.constant(2 : index) : i64 + %164 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb3(%162 : i64) + ^bb3(%165: i64): // 2 preds: ^bb2, ^bb10 + %166 = llvm.icmp "slt" %165, %163 : i64 + llvm.cond_br %166, ^bb4, ^bb11 + ^bb4: // pred: ^bb3 + %167 = llvm.mlir.constant(0 : index) : i64 + %168 = llvm.mlir.constant(5 : index) : i64 + %169 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb5(%167 : i64) + ^bb5(%170: i64): // 2 preds: ^bb4, ^bb9 + %171 = llvm.icmp "slt" %170, %168 : i64 + llvm.cond_br %171, ^bb6, ^bb10 + ^bb6: // pred: ^bb5 + %172 = llvm.mlir.constant(0 : index) : i64 + %173 = llvm.mlir.constant(5 : index) : i64 + %174 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb7(%172 : i64) + ^bb7(%175: i64): // 2 preds: ^bb6, ^bb8 + %176 = llvm.icmp "slt" %175, %173 : i64 + llvm.cond_br %176, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %177 = llvm.extractvalue %28[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %178 = llvm.mlir.constant(50 : index) : i64 + %179 = llvm.mul %160, %178 : i64 + %180 = llvm.mlir.constant(25 : index) : i64 + %181 = llvm.mul %165, %180 : i64 + %182 = llvm.add %179, %181 : i64 + %183 = llvm.mlir.constant(5 : index) : i64 + %184 = llvm.mul %170, 
%183 : i64 + %185 = llvm.add %182, %184 : i64 + %186 = llvm.add %185, %175 : i64 + %187 = llvm.getelementptr %177[%186] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %188 = llvm.load %187 : !llvm.ptr -> f32 + %189 = llvm.extractvalue %106[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %190 = llvm.mlir.constant(50 : index) : i64 + %191 = llvm.mul %160, %190 : i64 + %192 = llvm.mlir.constant(10 : index) : i64 + %193 = llvm.mul %170, %192 : i64 + %194 = llvm.add %191, %193 : i64 + %195 = llvm.mlir.constant(2 : index) : i64 + %196 = llvm.mul %175, %195 : i64 + %197 = llvm.add %194, %196 : i64 + %198 = llvm.add %197, %165 : i64 + %199 = llvm.getelementptr %189[%198] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %188, %199 : f32, !llvm.ptr + %200 = llvm.add %175, %174 : i64 + llvm.br ^bb7(%200 : i64) + ^bb9: // pred: ^bb7 + %201 = llvm.add %170, %169 : i64 + llvm.br ^bb5(%201 : i64) + ^bb10: // pred: ^bb5 + %202 = llvm.add %165, %164 : i64 + llvm.br ^bb3(%202 : i64) + ^bb11: // pred: ^bb3 + %203 = llvm.add %160, %159 : i64 + llvm.br ^bb1(%203 : i64) + ^bb12: // pred: ^bb1 + %204 = llvm.mlir.constant(0 : index) : i64 + %205 = llvm.mlir.constant(2 : index) : i64 + %206 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb13(%204 : i64) + ^bb13(%207: i64): // 2 preds: ^bb12, ^bb23 + %208 = llvm.icmp "slt" %207, %205 : i64 + llvm.cond_br %208, ^bb14, ^bb24 + ^bb14: // pred: ^bb13 + %209 = llvm.mlir.constant(0 : index) : i64 + %210 = llvm.mlir.constant(2 : index) : i64 + %211 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb15(%209 : i64) + ^bb15(%212: i64): // 2 preds: ^bb14, ^bb22 + %213 = llvm.icmp "slt" %212, %210 : i64 + llvm.cond_br %213, ^bb16, ^bb23 + ^bb16: // pred: ^bb15 + %214 = llvm.mlir.constant(0 : index) : i64 + %215 = llvm.mlir.constant(3 : index) : i64 + %216 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb17(%214 : i64) + ^bb17(%217: i64): // 2 preds: ^bb16, ^bb21 + %218 = llvm.icmp "slt" %217, %215 : i64 + llvm.cond_br %218, ^bb18, 
^bb22 + ^bb18: // pred: ^bb17 + %219 = llvm.mlir.constant(0 : index) : i64 + %220 = llvm.mlir.constant(3 : index) : i64 + %221 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb19(%219 : i64) + ^bb19(%222: i64): // 2 preds: ^bb18, ^bb20 + %223 = llvm.icmp "slt" %222, %220 : i64 + llvm.cond_br %223, ^bb20, ^bb21 + ^bb20: // pred: ^bb19 + %224 = llvm.mul %217, %155 : i64 + %225 = llvm.mul %224, %156 : i64 + %226 = llvm.mul %222, %156 : i64 + %227 = llvm.add %225, %226 : i64 + %228 = llvm.add %227, %212 : i64 + %229 = llvm.extractvalue %56[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %230 = llvm.mlir.constant(18 : index) : i64 + %231 = llvm.mul %207, %230 : i64 + %232 = llvm.mlir.constant(9 : index) : i64 + %233 = llvm.mul %212, %232 : i64 + %234 = llvm.add %231, %233 : i64 + %235 = llvm.mlir.constant(3 : index) : i64 + %236 = llvm.mul %217, %235 : i64 + %237 = llvm.add %234, %236 : i64 + %238 = llvm.add %237, %222 : i64 + %239 = llvm.getelementptr %229[%238] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %240 = llvm.load %239 : !llvm.ptr -> f32 + %241 = llvm.extractvalue %123[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %242 = llvm.mlir.constant(2 : index) : i64 + %243 = llvm.mul %228, %242 : i64 + %244 = llvm.add %243, %207 : i64 + %245 = llvm.getelementptr %241[%244] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %240, %245 : f32, !llvm.ptr + %246 = llvm.add %222, %221 : i64 + llvm.br ^bb19(%246 : i64) + ^bb21: // pred: ^bb19 + %247 = llvm.add %217, %216 : i64 + llvm.br ^bb17(%247 : i64) + ^bb22: // pred: ^bb17 + %248 = llvm.add %212, %211 : i64 + llvm.br ^bb15(%248 : i64) + ^bb23: // pred: ^bb15 + %249 = llvm.add %207, %206 : i64 + llvm.br ^bb13(%249 : i64) + ^bb24: // pred: ^bb13 + %250 = llvm.mlir.constant(3 : i64) : i64 + %251 = llvm.extractvalue %106[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %252 = llvm.ptrtoint %251 : !llvm.ptr to i64 + %253 = llvm.extractvalue %153[1] : !llvm.struct<(ptr, 
ptr, i64, array<2 x i64>, array<2 x i64>)> + %254 = llvm.ptrtoint %253 : !llvm.ptr to i64 + %255 = llvm.extractvalue %136[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %256 = llvm.ptrtoint %255 : !llvm.ptr to i64 + %257 = llvm.extractvalue %123[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %258 = llvm.ptrtoint %257 : !llvm.ptr to i64 + %259 = llvm.mlir.constant(2 : i64) : i64 + %260 = llvm.mlir.constant(2 : i64) : i64 + %261 = llvm.mlir.constant(4575657221408423938 : i64) : i64 + "gemmini.intr.config_st"(%260, %261) : (i64, i64) -> () + %262 = llvm.mlir.constant(65540 : i64) : i64 + %263 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%262, %263) : (i64, i64) -> () + %264 = llvm.mlir.constant(0 : i64) : i64 + %265 = llvm.mlir.constant(0 : i64) : i64 + %266 = llvm.mlir.constant(0 : i64) : i64 + %267 = llvm.mlir.constant(0 : i64) : i64 + %268 = llvm.mlir.constant(562958543683586 : i64) : i64 + %269 = llvm.mlir.constant(4295163907 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%268, %269) : (i64, i64) -> () + %270 = llvm.mlir.constant(844429225164800 : i64) : i64 + %271 = llvm.mlir.constant(562962838519810 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%270, %271) : (i64, i64) -> () + %272 = llvm.mlir.constant(844437815164928 : i64) : i64 + %273 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%272, %273) : (i64, i64) -> () + %274 = llvm.mlir.constant(844424930131968 : i64) : i64 + %275 = llvm.mlir.constant(65539 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%274, %275) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%258, %254) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%256, %252) : (i64, i64) -> () + %276 = llvm.mlir.constant(768 : i64) : i64 + %277 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%276, %277) : (i64, i64) -> () + %278 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%278, %278) : (i64, 
i64) -> () + %279 = llvm.mlir.constant(0 : index) : i64 + %280 = llvm.mlir.constant(2 : index) : i64 + %281 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb25(%279 : i64) + ^bb25(%282: i64): // 2 preds: ^bb24, ^bb35 + %283 = llvm.icmp "slt" %282, %280 : i64 + llvm.cond_br %283, ^bb26, ^bb36 + ^bb26: // pred: ^bb25 + %284 = llvm.mlir.constant(0 : index) : i64 + %285 = llvm.mlir.constant(2 : index) : i64 + %286 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb27(%284 : i64) + ^bb27(%287: i64): // 2 preds: ^bb26, ^bb34 + %288 = llvm.icmp "slt" %287, %285 : i64 + llvm.cond_br %288, ^bb28, ^bb35 + ^bb28: // pred: ^bb27 + %289 = llvm.mlir.constant(0 : index) : i64 + %290 = llvm.mlir.constant(3 : index) : i64 + %291 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb29(%289 : i64) + ^bb29(%292: i64): // 2 preds: ^bb28, ^bb33 + %293 = llvm.icmp "slt" %292, %290 : i64 + llvm.cond_br %293, ^bb30, ^bb34 + ^bb30: // pred: ^bb29 + %294 = llvm.mlir.constant(0 : index) : i64 + %295 = llvm.mlir.constant(3 : index) : i64 + %296 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb31(%294 : i64) + ^bb31(%297: i64): // 2 preds: ^bb30, ^bb32 + %298 = llvm.icmp "slt" %297, %295 : i64 + llvm.cond_br %298, ^bb32, ^bb33 + ^bb32: // pred: ^bb31 + %299 = llvm.mlir.constant(3 : index) : i64 + %300 = llvm.mul %282, %299 : i64 + %301 = llvm.mul %300, %299 : i64 + %302 = llvm.mul %292, %299 : i64 + %303 = llvm.add %301, %302 : i64 + %304 = llvm.add %303, %297 : i64 + %305 = llvm.extractvalue %153[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %306 = llvm.mlir.constant(2 : index) : i64 + %307 = llvm.mul %304, %306 : i64 + %308 = llvm.add %307, %287 : i64 + %309 = llvm.getelementptr %305[%308] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %310 = llvm.load %309 : !llvm.ptr -> f32 + %311 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %312 = llvm.mlir.constant(18 : index) : i64 + %313 = llvm.mul %282, %312 : i64 + %314 = 
llvm.mlir.constant(9 : index) : i64 + %315 = llvm.mul %287, %314 : i64 + %316 = llvm.add %313, %315 : i64 + %317 = llvm.mlir.constant(3 : index) : i64 + %318 = llvm.mul %292, %317 : i64 + %319 = llvm.add %316, %318 : i64 + %320 = llvm.add %319, %297 : i64 + %321 = llvm.getelementptr %311[%320] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %310, %321 : f32, !llvm.ptr + %322 = llvm.add %297, %296 : i64 + llvm.br ^bb31(%322 : i64) + ^bb33: // pred: ^bb31 + %323 = llvm.add %292, %291 : i64 + llvm.br ^bb29(%323 : i64) + ^bb34: // pred: ^bb29 + %324 = llvm.add %287, %286 : i64 + llvm.br ^bb27(%324 : i64) + ^bb35: // pred: ^bb27 + %325 = llvm.add %282, %281 : i64 + llvm.br ^bb25(%325 : i64) + ^bb36: // pred: ^bb25 + %326 = llvm.extractvalue %106[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + llvm.call @free(%326) : (!llvm.ptr) -> () + %327 = llvm.extractvalue %123[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%327) : (!llvm.ptr) -> () + %328 = llvm.extractvalue %153[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%328) : (!llvm.ptr) -> () + %329 = llvm.extractvalue %136[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + llvm.call @free(%329) : (!llvm.ptr) -> () + %330 = llvm.mlir.addressof @frmt_spec : !llvm.ptr + %331 = llvm.mlir.constant(0 : index) : i64 + %332 = llvm.getelementptr %330[%331, %331] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8> + %333 = llvm.mlir.addressof @nl : !llvm.ptr + %334 = llvm.mlir.constant(0 : index) : i64 + %335 = llvm.getelementptr %333[%334, %334] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8> + %336 = llvm.mlir.constant(0 : index) : i64 + %337 = llvm.mlir.constant(2 : index) : i64 + %338 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb37(%336 : i64) + ^bb37(%339: i64): // 2 preds: ^bb36, ^bb47 + %340 = llvm.icmp "slt" %339, %337 : i64 + llvm.cond_br %340, ^bb38, ^bb48 + ^bb38: // pred: ^bb37 + 
%341 = llvm.mlir.constant(0 : index) : i64 + %342 = llvm.mlir.constant(2 : index) : i64 + %343 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb39(%341 : i64) + ^bb39(%344: i64): // 2 preds: ^bb38, ^bb46 + %345 = llvm.icmp "slt" %344, %342 : i64 + llvm.cond_br %345, ^bb40, ^bb47 + ^bb40: // pred: ^bb39 + %346 = llvm.mlir.constant(0 : index) : i64 + %347 = llvm.mlir.constant(3 : index) : i64 + %348 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb41(%346 : i64) + ^bb41(%349: i64): // 2 preds: ^bb40, ^bb45 + %350 = llvm.icmp "slt" %349, %347 : i64 + llvm.cond_br %350, ^bb42, ^bb46 + ^bb42: // pred: ^bb41 + %351 = llvm.mlir.constant(0 : index) : i64 + %352 = llvm.mlir.constant(3 : index) : i64 + %353 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb43(%351 : i64) + ^bb43(%354: i64): // 2 preds: ^bb42, ^bb44 + %355 = llvm.icmp "slt" %354, %352 : i64 + llvm.cond_br %355, ^bb44, ^bb45 + ^bb44: // pred: ^bb43 + %356 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %357 = llvm.mlir.constant(18 : index) : i64 + %358 = llvm.mul %339, %357 : i64 + %359 = llvm.mlir.constant(9 : index) : i64 + %360 = llvm.mul %344, %359 : i64 + %361 = llvm.add %358, %360 : i64 + %362 = llvm.mlir.constant(3 : index) : i64 + %363 = llvm.mul %349, %362 : i64 + %364 = llvm.add %361, %363 : i64 + %365 = llvm.add %364, %354 : i64 + %366 = llvm.getelementptr %356[%365] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %367 = llvm.load %366 : !llvm.ptr -> f32 + %368 = llvm.fpext %367 : f32 to f64 + %369 = llvm.call @printf(%332, %368) vararg(!llvm.func) : (!llvm.ptr, f64) -> i32 + %370 = llvm.add %354, %353 : i64 + llvm.br ^bb43(%370 : i64) + ^bb45: // pred: ^bb43 + %371 = llvm.call @printf(%335) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %372 = llvm.add %349, %348 : i64 + llvm.br ^bb41(%372 : i64) + ^bb46: // pred: ^bb41 + %373 = llvm.call @printf(%335) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %374 = llvm.add %344, %343 : i64 + llvm.br ^bb39(%374 : i64) + ^bb47: 
// pred: ^bb39 + %375 = llvm.call @printf(%335) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %376 = llvm.add %339, %338 : i64 + llvm.br ^bb37(%376 : i64) + ^bb48: // pred: ^bb37 + llvm.return %0 : i8 + } +} + + From 5c7aad33632a83d429593c878822d0b106946a8e Mon Sep 17 00:00:00 2001 From: sparsh Date: Tue, 9 Dec 2025 01:59:05 -0800 Subject: [PATCH 03/13] Add helper script to dump Gemmini lowering IR --- experiments/gemmini/scripts/dump_one.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100755 experiments/gemmini/scripts/dump_one.sh diff --git a/experiments/gemmini/scripts/dump_one.sh b/experiments/gemmini/scripts/dump_one.sh new file mode 100755 index 0000000..eb68498 --- /dev/null +++ b/experiments/gemmini/scripts/dump_one.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Usage: dump_one.sh <input.mlir> [outdir]  (BUDDY_OPT overrides the buddy-opt binary path) +# Runs buddy-opt with --convert-linalg-to-gemmini and --lower-gemmini on <input.mlir>, saves the --mlir-print-ir-after-all dump under <outdir>, then lists the first gemmini.* lines of that dump. +set -euo pipefail + +BUDDY_OPT="${BUDDY_OPT:-$HOME/work/buddy-mlir/build/bin/buddy-opt}" +IN="${1:?usage: dump_one.sh <input.mlir> [outdir]}" +OUTDIR="${2:-experiments/gemmini/logs}" +mkdir -p "$OUTDIR" + +base="$(basename "$IN")" +LOG="$OUTDIR/${base}.print-after-all.mlir" + +"$BUDDY_OPT" "$IN" \ + --convert-linalg-to-gemmini \ + --lower-gemmini \ + --mlir-print-ir-after-all \ + 2> "$LOG" > /dev/null + +echo "Wrote: $LOG" +grep -n "gemmini\\." 
"$LOG" | head -n 20 || true From 8f3a9428e8db3afa9a2e6ccab89f91bb172f4a66 Mon Sep 17 00:00:00 2001 From: sparsh Date: Tue, 9 Dec 2025 02:01:16 -0800 Subject: [PATCH 04/13] Add IR dump for tile-matmul-ws-softmax.mlir --- .../inputs/tile-matmul-ws-softmax.mlir | 49 +++ ...atmul-ws-softmax.mlir.print-after-all.mlir | 323 ++++++++++++++++++ 2 files changed, 372 insertions(+) create mode 100644 experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir create mode 100644 experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir diff --git a/experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir b/experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir new file mode 100644 index 0000000..c81bccc --- /dev/null +++ b/experiments/gemmini/inputs/tile-matmul-ws-softmax.mlir @@ -0,0 +1,49 @@ +// RUN: buddy-opt %s \ +// RUN: --lower-gemmini | \ +// RUN: FileCheck %s + +memref.global "private" @g1 : memref<5x5xi8> = dense<[[1, 0, 0, 1, 0], [1, -1, 1, 0, 0], [-1, 0, 1, -1, 1], [1, 0, 0, 1, 0], [-1, 0, 0, -1, 0]]> +memref.global "private" @g2 : memref<5x5xi8> = dense<[[1, -1, 0, 0, 1], [1, 0, -1, 0, -1], [-1, -1, 0, -1, 1], [-1, 0, 0, 1, 0], [1, 0, 0, -1, 0]]> + + +func.func @main() -> i8 { + %i0 = arith.constant 0 : i8 + %i1I8 = arith.constant 1 : i8 + %minus1 = arith.constant -2 : i8 + %i2I8 = arith.constant 2 : i8 + %i2I32 = arith.constant 2 : i32 + %dI32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %aArray = memref.get_global @g1 : memref<5x5xi8> + %bArray = memref.get_global @g2 : memref<5x5xi8> + %cArray = memref.alloc() : memref<5x5xi8> + %dArray = memref.alloc() : memref<5x5xi32> + %dim_I = memref.dim %aArray, %c0 : memref<5x5xi8> + %dim_J = memref.dim %bArray, %c1 : memref<5x5xi8> + %dim_K = memref.dim %aArray, %c1 : memref<5x5xi8> + + scf.for %i3 = %c0 to %dim_I step %c1 { + scf.for %j3 = %c0 to %dim_J step %c1 { + memref.store %dI32, %dArray[%i3, %j3] : memref<5x5xi32> + } + } + + gemmini.tile_matmul %aArray 
%bArray %cArray %dArray {dataflow=1}: memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32> + gemmini.print %cArray : memref<5x5xi8> + + // CHECK: "gemmini.intr.config_ex" + // CHECK: "gemmini.intr.config_st" + // CHECK: "gemmini.intr.config_ld" + // CHECK: "gemmini.intr.config_norm" + // CHECK: "gemmini.intr.loop_ws_config_bounds" + // CHECK: "gemmini.intr.loop_ws_config_addrs_ab" + // CHECK: "gemmini.intr.loop_ws_config_addrs_dc" + // CHECK: "gemmini.intr.loop_ws_config_strides_ab" + // CHECK: "gemmini.intr.loop_ws_config_strides_dc" + // CHECK: "gemmini.intr.loop_ws" + // CHECK: "gemmini.intr.flush" + gemmini.tile_matmul %aArray %bArray %cArray %dArray {dataflow=1, act=4, bertScale=0.05:f32}: memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32> + gemmini.print %cArray : memref<5x5xi8> + return %i0 : i8 +} diff --git a/experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir b/experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir new file mode 100644 index 0000000..abf64aa --- /dev/null +++ b/experiments/gemmini/logs/tile-matmul-ws-softmax.mlir.print-after-all.mlir @@ -0,0 +1,323 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + memref.global "private" @g1 : memref<5x5xi8> = dense<[[1, 0, 0, 1, 0], [1, -1, 1, 0, 0], [-1, 0, 1, -1, 1], [1, 0, 0, 1, 0], [-1, 0, 0, -1, 0]]> + memref.global "private" @g2 : memref<5x5xi8> = dense<[[1, -1, 0, 0, 1], [1, 0, -1, 0, -1], [-1, -1, 0, -1, 1], [-1, 0, 0, 1, 0], [1, 0, 0, -1, 0]]> + func.func @main() -> i8 { + %c0_i8 = arith.constant 0 : i8 + %c1_i8 = arith.constant 1 : i8 + %c-2_i8 = arith.constant -2 : i8 + %c2_i8 = arith.constant 2 : i8 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = memref.get_global @g1 : memref<5x5xi8> + %1 = memref.get_global @g2 : memref<5x5xi8> + %alloc = memref.alloc() : 
memref<5x5xi8> + %alloc_0 = memref.alloc() : memref<5x5xi32> + %dim = memref.dim %0, %c0 : memref<5x5xi8> + %dim_1 = memref.dim %1, %c1 : memref<5x5xi8> + %dim_2 = memref.dim %0, %c1 : memref<5x5xi8> + scf.for %arg0 = %c0 to %dim step %c1 { + scf.for %arg1 = %c0 to %dim_1 step %c1 { + memref.store %c0_i32, %alloc_0[%arg0, %arg1] : memref<5x5xi32> + } + } + gemmini.tile_matmul %0 %1 %alloc %alloc_0 : memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32> + gemmini.print %alloc : memref<5x5xi8> + gemmini.tile_matmul %0 %1 %alloc %alloc_0 {act = 4 : i64, bertScale = 5.000000e-02 : f32} : memref<5x5xi8> memref<5x5xi8> memref<5x5xi8> memref<5x5xi32> + gemmini.print %alloc : memref<5x5xi8> + return %c0_i8 : i8 + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.mlir.global internal constant @nl("\0A\00") {addr_space = 0 : i32} + llvm.mlir.global internal constant @frmt_spec("%d \00") {addr_space = 0 : i32} + llvm.func @printf(!llvm.ptr, ...) 
-> i32 + llvm.func @malloc(i64) -> !llvm.ptr + llvm.mlir.global private @g1(dense<[[1, 0, 0, 1, 0], [1, -1, 1, 0, 0], [-1, 0, 1, -1, 1], [1, 0, 0, 1, 0], [-1, 0, 0, -1, 0]]> : tensor<5x5xi8>) {addr_space = 0 : i32} : !llvm.array<5 x array<5 x i8>> + llvm.mlir.global private @g2(dense<[[1, -1, 0, 0, 1], [1, 0, -1, 0, -1], [-1, -1, 0, -1, 1], [-1, 0, 0, 1, 0], [1, 0, 0, -1, 0]]> : tensor<5x5xi8>) {addr_space = 0 : i32} : !llvm.array<5 x array<5 x i8>> + llvm.func @main() -> i8 { + %0 = llvm.mlir.constant(0 : i8) : i8 + %1 = llvm.mlir.constant(1 : i8) : i8 + %2 = llvm.mlir.constant(-2 : i8) : i8 + %3 = llvm.mlir.constant(2 : i8) : i8 + %4 = llvm.mlir.constant(2 : i32) : i32 + %5 = llvm.mlir.constant(0 : i32) : i32 + %6 = llvm.mlir.constant(0 : index) : i64 + %7 = llvm.mlir.constant(1 : index) : i64 + %8 = llvm.mlir.constant(5 : index) : i64 + %9 = llvm.mlir.constant(5 : index) : i64 + %10 = llvm.mlir.constant(1 : index) : i64 + %11 = llvm.mlir.constant(25 : index) : i64 + %12 = llvm.mlir.zero : !llvm.ptr + %13 = llvm.getelementptr %12[%11] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %14 = llvm.ptrtoint %13 : !llvm.ptr to i64 + %15 = llvm.mlir.addressof @g1 : !llvm.ptr + %16 = llvm.getelementptr %15[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<5 x array<5 x i8>> + %17 = llvm.mlir.constant(3735928559 : index) : i64 + %18 = llvm.inttoptr %17 : i64 to !llvm.ptr + %19 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %20 = llvm.insertvalue %18, %19[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %21 = llvm.insertvalue %16, %20[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %22 = llvm.mlir.constant(0 : index) : i64 + %23 = llvm.insertvalue %22, %21[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %24 = llvm.insertvalue %8, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %25 = llvm.insertvalue %9, %24[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, 
array<2 x i64>)> + %26 = llvm.insertvalue %9, %25[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %27 = llvm.insertvalue %10, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.mlir.constant(5 : index) : i64 + %29 = llvm.mlir.constant(5 : index) : i64 + %30 = llvm.mlir.constant(1 : index) : i64 + %31 = llvm.mlir.constant(25 : index) : i64 + %32 = llvm.mlir.zero : !llvm.ptr + %33 = llvm.getelementptr %32[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %34 = llvm.ptrtoint %33 : !llvm.ptr to i64 + %35 = llvm.mlir.addressof @g2 : !llvm.ptr + %36 = llvm.getelementptr %35[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<5 x array<5 x i8>> + %37 = llvm.mlir.constant(3735928559 : index) : i64 + %38 = llvm.inttoptr %37 : i64 to !llvm.ptr + %39 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %40 = llvm.insertvalue %38, %39[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %41 = llvm.insertvalue %36, %40[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %42 = llvm.mlir.constant(0 : index) : i64 + %43 = llvm.insertvalue %42, %41[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %44 = llvm.insertvalue %28, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %45 = llvm.insertvalue %29, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %46 = llvm.insertvalue %29, %45[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %47 = llvm.insertvalue %30, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %48 = llvm.mlir.constant(5 : index) : i64 + %49 = llvm.mlir.constant(5 : index) : i64 + %50 = llvm.mlir.constant(1 : index) : i64 + %51 = llvm.mlir.constant(25 : index) : i64 + %52 = llvm.mlir.zero : !llvm.ptr + %53 = llvm.getelementptr %52[%51] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %54 = llvm.ptrtoint %53 : !llvm.ptr to i64 + %55 = llvm.call 
@malloc(%54) : (i64) -> !llvm.ptr + %56 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %57 = llvm.insertvalue %55, %56[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %58 = llvm.insertvalue %55, %57[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %59 = llvm.mlir.constant(0 : index) : i64 + %60 = llvm.insertvalue %59, %58[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %61 = llvm.insertvalue %48, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %62 = llvm.insertvalue %49, %61[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %63 = llvm.insertvalue %49, %62[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %64 = llvm.insertvalue %50, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %65 = llvm.mlir.constant(5 : index) : i64 + %66 = llvm.mlir.constant(5 : index) : i64 + %67 = llvm.mlir.constant(1 : index) : i64 + %68 = llvm.mlir.constant(25 : index) : i64 + %69 = llvm.mlir.zero : !llvm.ptr + %70 = llvm.getelementptr %69[%68] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %71 = llvm.ptrtoint %70 : !llvm.ptr to i64 + %72 = llvm.call @malloc(%71) : (i64) -> !llvm.ptr + %73 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %74 = llvm.insertvalue %72, %73[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %75 = llvm.insertvalue %72, %74[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %76 = llvm.mlir.constant(0 : index) : i64 + %77 = llvm.insertvalue %76, %75[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %78 = llvm.insertvalue %65, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %79 = llvm.insertvalue %66, %78[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %80 = llvm.insertvalue %66, %79[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, 
array<2 x i64>)> + %81 = llvm.insertvalue %67, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %82 = llvm.mlir.constant(5 : index) : i64 + %83 = llvm.mlir.constant(5 : index) : i64 + %84 = llvm.mlir.constant(5 : index) : i64 + llvm.br ^bb1(%6 : i64) + ^bb1(%85: i64): // 2 preds: ^bb0, ^bb5 + %86 = llvm.icmp "slt" %85, %82 : i64 + llvm.cond_br %86, ^bb2, ^bb6 + ^bb2: // pred: ^bb1 + llvm.br ^bb3(%6 : i64) + ^bb3(%87: i64): // 2 preds: ^bb2, ^bb4 + %88 = llvm.icmp "slt" %87, %83 : i64 + llvm.cond_br %88, ^bb4, ^bb5 + ^bb4: // pred: ^bb3 + %89 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %90 = llvm.mlir.constant(5 : index) : i64 + %91 = llvm.mul %85, %90 : i64 + %92 = llvm.add %91, %87 : i64 + %93 = llvm.getelementptr %89[%92] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + llvm.store %5, %93 : i32, !llvm.ptr + %94 = llvm.add %87, %7 : i64 + llvm.br ^bb3(%94 : i64) + ^bb5: // pred: ^bb3 + %95 = llvm.add %85, %7 : i64 + llvm.br ^bb1(%95 : i64) + ^bb6: // pred: ^bb1 + %96 = llvm.extractvalue %27[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %97 = llvm.ptrtoint %96 : !llvm.ptr to i64 + %98 = llvm.extractvalue %47[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %99 = llvm.ptrtoint %98 : !llvm.ptr to i64 + %100 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %101 = llvm.ptrtoint %100 : !llvm.ptr to i64 + %102 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %103 = llvm.ptrtoint %102 : !llvm.ptr to i64 + %104 = llvm.mlir.constant(4575657221408489476 : i64) : i64 + %105 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%104, %105) : (i64, i64) -> () + %106 = llvm.mlir.constant(5 : i64) : i64 + %107 = llvm.mlir.constant(2 : i64) : i64 + %108 = llvm.mlir.constant(4575657221408423941 : i64) : i64 + "gemmini.intr.config_st"(%107, %108) : (i64, i64) -> () + 
%109 = llvm.mlir.constant(5 : i64) : i64 + %110 = llvm.mlir.constant(4575657221409472769 : i64) : i64 + "gemmini.intr.config_ld"(%110, %109) : (i64, i64) -> () + %111 = llvm.mlir.constant(5 : i64) : i64 + %112 = llvm.mlir.constant(4575657221409472777 : i64) : i64 + "gemmini.intr.config_ld"(%112, %111) : (i64, i64) -> () + %113 = llvm.mlir.constant(20 : i64) : i64 + %114 = llvm.mlir.constant(4575657221409472785 : i64) : i64 + "gemmini.intr.config_ld"(%114, %113) : (i64, i64) -> () + %115 = llvm.mlir.constant(0 : i64) : i64 + %116 = llvm.mlir.constant(0 : i64) : i64 + %117 = llvm.mlir.constant(0 : i64) : i64 + %118 = llvm.mlir.constant(0 : i64) : i64 + %119 = llvm.mlir.constant(47245361163 : i64) : i64 + %120 = llvm.mlir.constant(4295032833 : i64) : i64 + "gemmini.intr.loop_ws_config_bounds"(%119, %120) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_ab"(%97, %99) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_dc"(%103, %101) : (i64, i64) -> () + %121 = llvm.mlir.constant(5 : i64) : i64 + %122 = llvm.mlir.constant(5 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_ab"(%121, %122) : (i64, i64) -> () + %123 = llvm.mlir.constant(5 : i64) : i64 + %124 = llvm.mlir.constant(5 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_dc"(%123, %124) : (i64, i64) -> () + %125 = llvm.mlir.constant(1 : i64) : i64 + %126 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_ws"(%125, %126) : (i64, i64) -> () + %127 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%127, %127) : (i64, i64) -> () + %128 = llvm.mlir.addressof @frmt_spec : !llvm.ptr + %129 = llvm.mlir.constant(0 : index) : i64 + %130 = llvm.getelementptr %128[%129, %129] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8> + %131 = llvm.mlir.addressof @nl : !llvm.ptr + %132 = llvm.mlir.constant(0 : index) : i64 + %133 = llvm.getelementptr %131[%132, %132] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8> + %134 = llvm.mlir.constant(0 : index) : i64 + %135 = 
llvm.mlir.constant(5 : index) : i64 + %136 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb7(%134 : i64) + ^bb7(%137: i64): // 2 preds: ^bb6, ^bb11 + %138 = llvm.icmp "slt" %137, %135 : i64 + llvm.cond_br %138, ^bb8, ^bb12 + ^bb8: // pred: ^bb7 + %139 = llvm.mlir.constant(0 : index) : i64 + %140 = llvm.mlir.constant(5 : index) : i64 + %141 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb9(%139 : i64) + ^bb9(%142: i64): // 2 preds: ^bb8, ^bb10 + %143 = llvm.icmp "slt" %142, %140 : i64 + llvm.cond_br %143, ^bb10, ^bb11 + ^bb10: // pred: ^bb9 + %144 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %145 = llvm.mlir.constant(5 : index) : i64 + %146 = llvm.mul %137, %145 : i64 + %147 = llvm.add %146, %142 : i64 + %148 = llvm.getelementptr %144[%147] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %149 = llvm.load %148 : !llvm.ptr -> i8 + %150 = llvm.sext %149 : i8 to i32 + %151 = llvm.call @printf(%130, %150) vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + %152 = llvm.add %142, %141 : i64 + llvm.br ^bb9(%152 : i64) + ^bb11: // pred: ^bb9 + %153 = llvm.call @printf(%133) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %154 = llvm.add %137, %136 : i64 + llvm.br ^bb7(%154 : i64) + ^bb12: // pred: ^bb7 + %155 = llvm.extractvalue %27[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %156 = llvm.ptrtoint %155 : !llvm.ptr to i64 + %157 = llvm.extractvalue %47[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %158 = llvm.ptrtoint %157 : !llvm.ptr to i64 + %159 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %160 = llvm.ptrtoint %159 : !llvm.ptr to i64 + %161 = llvm.extractvalue %81[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %162 = llvm.ptrtoint %161 : !llvm.ptr to i64 + %163 = llvm.mlir.constant(4575657221408489476 : i64) : i64 + %164 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%163, %164) : (i64, i64) -> 
() + %165 = llvm.mlir.constant(5 : i64) : i64 + %166 = llvm.mlir.constant(2 : i64) : i64 + %167 = llvm.mlir.constant(4575657221408423941 : i64) : i64 + "gemmini.intr.config_st"(%166, %167) : (i64, i64) -> () + %168 = llvm.mlir.constant(5 : i64) : i64 + %169 = llvm.mlir.constant(4575657221409472769 : i64) : i64 + "gemmini.intr.config_ld"(%169, %168) : (i64, i64) -> () + %170 = llvm.mlir.constant(5 : i64) : i64 + %171 = llvm.mlir.constant(4575657221409472777 : i64) : i64 + "gemmini.intr.config_ld"(%171, %170) : (i64, i64) -> () + %172 = llvm.mlir.constant(20 : i64) : i64 + %173 = llvm.mlir.constant(4575657221409472785 : i64) : i64 + "gemmini.intr.config_ld"(%173, %172) : (i64, i64) -> () + %174 = llvm.mlir.constant(55834640387 : i64) : i64 + %175 = llvm.mlir.constant(1644972474395 : i64) : i64 + "gemmini.intr.config_norm"(%174, %175) : (i64, i64) -> () + %176 = llvm.mlir.constant(21650930466819 : i64) : i64 + %177 = llvm.mlir.constant(1644972474395 : i64) : i64 + "gemmini.intr.config_norm"(%176, %177) : (i64, i64) -> () + %178 = llvm.mlir.constant(0 : i64) : i64 + %179 = llvm.mlir.constant(0 : i64) : i64 + %180 = llvm.mlir.constant(0 : i64) : i64 + %181 = llvm.mlir.constant(0 : i64) : i64 + %182 = llvm.mlir.constant(47245361163 : i64) : i64 + %183 = llvm.mlir.constant(4295032833 : i64) : i64 + "gemmini.intr.loop_ws_config_bounds"(%182, %183) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_ab"(%156, %158) : (i64, i64) -> () + "gemmini.intr.loop_ws_config_addrs_dc"(%162, %160) : (i64, i64) -> () + %184 = llvm.mlir.constant(5 : i64) : i64 + %185 = llvm.mlir.constant(5 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_ab"(%184, %185) : (i64, i64) -> () + %186 = llvm.mlir.constant(5 : i64) : i64 + %187 = llvm.mlir.constant(5 : i64) : i64 + "gemmini.intr.loop_ws_config_strides_dc"(%186, %187) : (i64, i64) -> () + %188 = llvm.mlir.constant(1025 : i64) : i64 + %189 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_ws"(%188, %189) : (i64, i64) -> () + %190 = 
llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%190, %190) : (i64, i64) -> () + %191 = llvm.mlir.addressof @frmt_spec : !llvm.ptr + %192 = llvm.mlir.constant(0 : index) : i64 + %193 = llvm.getelementptr %191[%192, %192] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<4 x i8> + %194 = llvm.mlir.addressof @nl : !llvm.ptr + %195 = llvm.mlir.constant(0 : index) : i64 + %196 = llvm.getelementptr %194[%195, %195] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<2 x i8> + %197 = llvm.mlir.constant(0 : index) : i64 + %198 = llvm.mlir.constant(5 : index) : i64 + %199 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb13(%197 : i64) + ^bb13(%200: i64): // 2 preds: ^bb12, ^bb17 + %201 = llvm.icmp "slt" %200, %198 : i64 + llvm.cond_br %201, ^bb14, ^bb18 + ^bb14: // pred: ^bb13 + %202 = llvm.mlir.constant(0 : index) : i64 + %203 = llvm.mlir.constant(5 : index) : i64 + %204 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb15(%202 : i64) + ^bb15(%205: i64): // 2 preds: ^bb14, ^bb16 + %206 = llvm.icmp "slt" %205, %203 : i64 + llvm.cond_br %206, ^bb16, ^bb17 + ^bb16: // pred: ^bb15 + %207 = llvm.extractvalue %64[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %208 = llvm.mlir.constant(5 : index) : i64 + %209 = llvm.mul %200, %208 : i64 + %210 = llvm.add %209, %205 : i64 + %211 = llvm.getelementptr %207[%210] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %212 = llvm.load %211 : !llvm.ptr -> i8 + %213 = llvm.sext %212 : i8 to i32 + %214 = llvm.call @printf(%193, %213) vararg(!llvm.func) : (!llvm.ptr, i32) -> i32 + %215 = llvm.add %205, %204 : i64 + llvm.br ^bb15(%215 : i64) + ^bb17: // pred: ^bb15 + %216 = llvm.call @printf(%196) vararg(!llvm.func) : (!llvm.ptr) -> i32 + %217 = llvm.add %200, %199 : i64 + llvm.br ^bb13(%217 : i64) + ^bb18: // pred: ^bb13 + llvm.return %0 : i8 + } +} + + From 3221999330ec41bc2907d92388995e51881071f8 Mon Sep 17 00:00:00 2001 From: sparsh Date: Tue, 9 Dec 2025 02:51:47 -0800 Subject: [PATCH 05/13] Document libgemmini build 
status on Mac --- experiments/gemmini/libgemmini_status.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 experiments/gemmini/libgemmini_status.txt diff --git a/experiments/gemmini/libgemmini_status.txt b/experiments/gemmini/libgemmini_status.txt new file mode 100644 index 0000000..170f8b0 --- /dev/null +++ b/experiments/gemmini/libgemmini_status.txt @@ -0,0 +1,13 @@ +Mac setup notes: + +- buddy-mlir Gemmini lowering works (matmul, batch_matmul, conv, matmul+softmax). +- Generated LLVM IR (log.ll) and RISC-V asm (log.s) via Makefile targets, asm has Gemmini ops (config_ex, config_st, loop_ws, etc.). +- Spike + pk + riscv64-unknown-elf-gcc work for a simple "hello" test. + +Blocked on: +- Installing libgemmini (Spike extension) from https://github.com/ucb-bar/libgemmini. +- `make libgemmini.so` fails on macOS with `ld: symbol(s) not found for architecture arm64` and RISCV-dependent paths that assume a full Chipyard/Gemmini tree. + +Plan: +- Use Mac primarily for IR/pipeline experiments. +- Do Spike+Gemmini execution on a SLICE Linux machine with Chipyard/Gemmini installed. 
From 48ff9b047d622888fa240eedef5f40942ed7f4a7 Mon Sep 17 00:00:00 2001 From: sparsh Date: Thu, 11 Dec 2025 15:40:40 -0800 Subject: [PATCH 06/13] Buddy Gemmini: conv2d block lowering (NHWC x HWCF) --- .../logs/conv2d_block1.print-after-all.mlir | 693 ++++++++++++++++++ .../gemmini/networks/conv2d_block1.mlir | 13 + 2 files changed, 706 insertions(+) create mode 100644 experiments/gemmini/logs/conv2d_block1.print-after-all.mlir create mode 100644 experiments/gemmini/networks/conv2d_block1.mlir diff --git a/experiments/gemmini/logs/conv2d_block1.print-after-all.mlir b/experiments/gemmini/logs/conv2d_block1.print-after-all.mlir new file mode 100644 index 0000000..e22f289 --- /dev/null +++ b/experiments/gemmini/logs/conv2d_block1.print-after-all.mlir @@ -0,0 +1,693 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + func.func @conv2d_block1(%arg0: memref<1x32x32x32xf16>, %arg1: memref<3x3x32x64xf16>, %arg2: memref<1x30x30x64xf32>) { + %alloc = memref.alloc() : memref<288x64xf16> + %alloc_0 = memref.alloc() : memref<900x64xf32> + %alloc_1 = memref.alloc() : memref<64xi32> + %c0_i32 = arith.constant 0 : i32 + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<64xi32>) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + scf.for %arg3 = %c0 to %c3 step %c1 { + %c3_3 = arith.constant 3 : index + scf.for %arg4 = %c0 to %c3_3 step %c1 { + %c32 = arith.constant 32 : index + scf.for %arg5 = %c0 to %c32 step %c1 { + %c64 = arith.constant 64 : index + scf.for %arg6 = %c0 to %c64 step %c1 { + %c3_4 = arith.constant 3 : index + %c32_5 = arith.constant 32 : index + %0 = arith.muli %arg3, %c3_4 : index + %1 = arith.muli %0, %c32_5 : index + %2 = arith.muli %arg4, %c32_5 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<3x3x32x64xf16> + memref.store %5, %alloc[%4, %arg6] : 
memref<288x64xf16> + } + } + } + } + %c30_i64 = arith.constant 30 : i64 + %c3_i64 = arith.constant 3 : i64 + gemmini.tile_conv %arg0 %alloc %alloc_1 %alloc_0 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x32xf16> memref<288x64xf16> memref<64xi32> memref<900x64xf32> i64 i64 i64 + %c1_2 = arith.constant 1 : index + scf.for %arg3 = %c0 to %c1_2 step %c1 { + %c30 = arith.constant 30 : index + scf.for %arg4 = %c0 to %c30 step %c1 { + %c30_3 = arith.constant 30 : index + scf.for %arg5 = %c0 to %c30_3 step %c1 { + %c64 = arith.constant 64 : index + scf.for %arg6 = %c0 to %c64 step %c1 { + %c30_4 = arith.constant 30 : index + %0 = arith.muli %arg3, %c30_4 : index + %1 = arith.muli %0, %c30_4 : index + %2 = arith.muli %c30_4, %arg4 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %alloc_0[%4, %arg6] : memref<900x64xf32> + memref.store %5, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x30x30x64xf32> + } + } + } + } + memref.dealloc %alloc : memref<288x64xf16> + memref.dealloc %alloc_0 : memref<900x64xf32> + memref.dealloc %alloc_1 : memref<64xi32> + return + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.func @conv2d_block1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %2 = llvm.insertvalue %arg23, 
%1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %3 = llvm.insertvalue %arg24, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %4 = llvm.insertvalue %arg25, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %5 = llvm.insertvalue %arg29, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %6 = llvm.insertvalue %arg26, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %7 = llvm.insertvalue %arg30, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %8 = llvm.insertvalue %arg27, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %9 = llvm.insertvalue %arg31, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %10 = llvm.insertvalue %arg28, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %11 = llvm.insertvalue %arg32, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %13 = llvm.insertvalue %arg0, %12[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %14 = llvm.insertvalue %arg1, %13[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %15 = llvm.insertvalue %arg2, %14[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %16 = llvm.insertvalue %arg3, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.insertvalue %arg7, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg4, %17[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg8, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg5, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg9, %20[4, 2] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg6, %21[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg10, %22[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.insertvalue %arg11, %24[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %26 = llvm.insertvalue %arg12, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %27 = llvm.insertvalue %arg13, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %28 = llvm.insertvalue %arg14, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %29 = llvm.insertvalue %arg18, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %30 = llvm.insertvalue %arg15, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %31 = llvm.insertvalue %arg19, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %32 = llvm.insertvalue %arg16, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %33 = llvm.insertvalue %arg20, %32[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.insertvalue %arg17, %33[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.insertvalue %arg21, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(288 : index) : i64 + %37 = llvm.mlir.constant(64 : index) : i64 + %38 = llvm.mlir.constant(1 : index) : i64 + %39 = llvm.mlir.constant(18432 : index) : i64 + %40 = llvm.mlir.zero : !llvm.ptr + %41 = llvm.getelementptr %40[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.ptrtoint %41 : !llvm.ptr to i64 + %43 = llvm.call @malloc(%42) : (i64) -> !llvm.ptr + %44 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x 
i64>)> + %45 = llvm.insertvalue %43, %44[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %46 = llvm.insertvalue %43, %45[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %47 = llvm.mlir.constant(0 : index) : i64 + %48 = llvm.insertvalue %47, %46[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %49 = llvm.insertvalue %36, %48[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %50 = llvm.insertvalue %37, %49[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %51 = llvm.insertvalue %37, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %52 = llvm.insertvalue %38, %51[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %53 = llvm.mlir.constant(900 : index) : i64 + %54 = llvm.mlir.constant(64 : index) : i64 + %55 = llvm.mlir.constant(1 : index) : i64 + %56 = llvm.mlir.constant(57600 : index) : i64 + %57 = llvm.mlir.zero : !llvm.ptr + %58 = llvm.getelementptr %57[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %59 = llvm.ptrtoint %58 : !llvm.ptr to i64 + %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr + %61 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %64 = llvm.mlir.constant(0 : index) : i64 + %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %66 = llvm.insertvalue %53, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %67 = llvm.insertvalue %54, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %68 = llvm.insertvalue %54, %67[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %69 = llvm.insertvalue %55, %68[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 
x i64>)> + %70 = llvm.mlir.constant(64 : index) : i64 + %71 = llvm.mlir.constant(1 : index) : i64 + %72 = llvm.mlir.zero : !llvm.ptr + %73 = llvm.getelementptr %72[%70] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %74 = llvm.ptrtoint %73 : !llvm.ptr to i64 + %75 = llvm.call @malloc(%74) : (i64) -> !llvm.ptr + %76 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %77 = llvm.insertvalue %75, %76[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %78 = llvm.insertvalue %75, %77[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %79 = llvm.mlir.constant(0 : index) : i64 + %80 = llvm.insertvalue %79, %78[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %81 = llvm.insertvalue %70, %80[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %82 = llvm.insertvalue %71, %81[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %83 = builtin.unrealized_conversion_cast %82 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<64xi32> + %84 = llvm.mlir.constant(0 : i32) : i32 + linalg.fill ins(%84 : i32) outs(%83 : memref<64xi32>) + %85 = llvm.mlir.constant(0 : index) : i64 + %86 = llvm.mlir.constant(1 : index) : i64 + %87 = llvm.mlir.constant(3 : index) : i64 + llvm.br ^bb1(%85 : i64) + ^bb1(%88: i64): // 2 preds: ^bb0, ^bb11 + %89 = llvm.icmp "slt" %88, %87 : i64 + llvm.cond_br %89, ^bb2, ^bb12 + ^bb2: // pred: ^bb1 + %90 = llvm.mlir.constant(3 : index) : i64 + llvm.br ^bb3(%85 : i64) + ^bb3(%91: i64): // 2 preds: ^bb2, ^bb10 + %92 = llvm.icmp "slt" %91, %90 : i64 + llvm.cond_br %92, ^bb4, ^bb11 + ^bb4: // pred: ^bb3 + %93 = llvm.mlir.constant(32 : index) : i64 + llvm.br ^bb5(%85 : i64) + ^bb5(%94: i64): // 2 preds: ^bb4, ^bb9 + %95 = llvm.icmp "slt" %94, %93 : i64 + llvm.cond_br %95, ^bb6, ^bb10 + ^bb6: // pred: ^bb5 + %96 = llvm.mlir.constant(64 : index) : i64 + llvm.br ^bb7(%85 : i64) + ^bb7(%97: i64): // 2 preds: ^bb6, ^bb8 + %98 = 
llvm.icmp "slt" %97, %96 : i64 + llvm.cond_br %98, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %99 = llvm.mlir.constant(3 : index) : i64 + %100 = llvm.mlir.constant(32 : index) : i64 + %101 = llvm.mul %88, %99 : i64 + %102 = llvm.mul %101, %100 : i64 + %103 = llvm.mul %91, %100 : i64 + %104 = llvm.add %102, %103 : i64 + %105 = llvm.add %104, %94 : i64 + %106 = llvm.extractvalue %35[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %107 = llvm.mlir.constant(6144 : index) : i64 + %108 = llvm.mul %88, %107 : i64 + %109 = llvm.mlir.constant(2048 : index) : i64 + %110 = llvm.mul %91, %109 : i64 + %111 = llvm.add %108, %110 : i64 + %112 = llvm.mlir.constant(64 : index) : i64 + %113 = llvm.mul %94, %112 : i64 + %114 = llvm.add %111, %113 : i64 + %115 = llvm.add %114, %97 : i64 + %116 = llvm.getelementptr %106[%115] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %117 = llvm.load %116 : !llvm.ptr -> f16 + %118 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %119 = llvm.mlir.constant(64 : index) : i64 + %120 = llvm.mul %105, %119 : i64 + %121 = llvm.add %120, %97 : i64 + %122 = llvm.getelementptr %118[%121] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %117, %122 : f16, !llvm.ptr + %123 = llvm.add %97, %86 : i64 + llvm.br ^bb7(%123 : i64) + ^bb9: // pred: ^bb7 + %124 = llvm.add %94, %86 : i64 + llvm.br ^bb5(%124 : i64) + ^bb10: // pred: ^bb5 + %125 = llvm.add %91, %86 : i64 + llvm.br ^bb3(%125 : i64) + ^bb11: // pred: ^bb3 + %126 = llvm.add %88, %86 : i64 + llvm.br ^bb1(%126 : i64) + ^bb12: // pred: ^bb1 + %127 = llvm.mlir.constant(30 : i64) : i64 + %128 = llvm.mlir.constant(3 : i64) : i64 + %129 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %130 = llvm.ptrtoint %129 : !llvm.ptr to i64 + %131 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %132 = llvm.ptrtoint %131 : !llvm.ptr to i64 + %133 = llvm.extractvalue %82[1] : 
!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %134 = llvm.ptrtoint %133 : !llvm.ptr to i64 + %135 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %136 = llvm.ptrtoint %135 : !llvm.ptr to i64 + %137 = llvm.mlir.constant(64 : i64) : i64 + %138 = llvm.mlir.constant(2 : i64) : i64 + %139 = llvm.mlir.constant(4575657221408424000 : i64) : i64 + "gemmini.intr.config_st"(%138, %139) : (i64, i64) -> () + %140 = llvm.mlir.constant(65540 : i64) : i64 + %141 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%140, %141) : (i64, i64) -> () + %142 = llvm.mlir.constant(0 : i64) : i64 + %143 = llvm.mlir.constant(0 : i64) : i64 + %144 = llvm.mlir.constant(0 : i64) : i64 + %145 = llvm.mlir.constant(0 : i64) : i64 + %146 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %147 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%146, %147) : (i64, i64) -> () + %148 = llvm.mlir.constant(844429225164800 : i64) : i64 + %149 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%148, %149) : (i64, i64) -> () + %150 = llvm.mlir.constant(844437817131008 : i64) : i64 + %151 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%150, %151) : (i64, i64) -> () + %152 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %153 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%152, %153) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %132) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %130) : (i64, i64) -> () + %154 = llvm.mlir.constant(256 : i64) : i64 + %155 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%154, %155) : (i64, i64) -> () + %156 = llvm.mlir.constant(16 : i64) : i64 + %157 = llvm.add %132, %156 : i64 + %158 = llvm.mlir.constant(64 : i64) : i64 + %159 = llvm.add %134, %158 : i64 + %160 = llvm.mlir.constant(16 : i64) : i64 + %161 = 
llvm.add %136, %160 : i64 + %162 = llvm.mlir.constant(0 : i64) : i64 + %163 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %164 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%163, %164) : (i64, i64) -> () + %165 = llvm.mlir.constant(844429225164800 : i64) : i64 + %166 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%165, %166) : (i64, i64) -> () + %167 = llvm.mlir.constant(844437817131008 : i64) : i64 + %168 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%167, %168) : (i64, i64) -> () + %169 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %170 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%169, %170) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%161, %157) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%159, %130) : (i64, i64) -> () + %171 = llvm.mlir.constant(256 : i64) : i64 + %172 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%171, %172) : (i64, i64) -> () + %173 = llvm.mlir.constant(32 : i64) : i64 + %174 = llvm.add %132, %173 : i64 + %175 = llvm.mlir.constant(128 : i64) : i64 + %176 = llvm.add %134, %175 : i64 + %177 = llvm.mlir.constant(32 : i64) : i64 + %178 = llvm.add %136, %177 : i64 + %179 = llvm.mlir.constant(0 : i64) : i64 + %180 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %181 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%180, %181) : (i64, i64) -> () + %182 = llvm.mlir.constant(844429225164800 : i64) : i64 + %183 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%182, %183) : (i64, i64) -> () + %184 = llvm.mlir.constant(844437817131008 : i64) : i64 + %185 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%184, %185) : (i64, i64) -> () + %186 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %187 = llvm.mlir.constant(65559 : i64) : i64 + 
"gemmini.intr.loop_conv_ws_config4"(%186, %187) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%178, %174) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%176, %130) : (i64, i64) -> () + %188 = llvm.mlir.constant(256 : i64) : i64 + %189 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%188, %189) : (i64, i64) -> () + %190 = llvm.mlir.constant(48 : i64) : i64 + %191 = llvm.add %132, %190 : i64 + %192 = llvm.mlir.constant(192 : i64) : i64 + %193 = llvm.add %134, %192 : i64 + %194 = llvm.mlir.constant(48 : i64) : i64 + %195 = llvm.add %136, %194 : i64 + %196 = llvm.mlir.constant(0 : i64) : i64 + %197 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %198 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%197, %198) : (i64, i64) -> () + %199 = llvm.mlir.constant(844429225164800 : i64) : i64 + %200 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%199, %200) : (i64, i64) -> () + %201 = llvm.mlir.constant(844437817131008 : i64) : i64 + %202 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%201, %202) : (i64, i64) -> () + %203 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %204 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%203, %204) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%195, %191) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%193, %130) : (i64, i64) -> () + %205 = llvm.mlir.constant(256 : i64) : i64 + %206 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%205, %206) : (i64, i64) -> () + %207 = llvm.mlir.constant(1472 : i64) : i64 + %208 = llvm.add %132, %207 : i64 + %209 = llvm.mlir.constant(0 : i64) : i64 + %210 = llvm.mlir.constant(0 : i64) : i64 + %211 = llvm.mlir.constant(736 : i64) : i64 + %212 = llvm.add %130, %211 : i64 + %213 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %214 = llvm.mlir.constant(4296933406 : i64) : i64 + 
"gemmini.intr.loop_conv_ws_config1"(%213, %214) : (i64, i64) -> () + %215 = llvm.mlir.constant(844429225164800 : i64) : i64 + %216 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%215, %216) : (i64, i64) -> () + %217 = llvm.mlir.constant(844437817131008 : i64) : i64 + %218 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%217, %218) : (i64, i64) -> () + %219 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %220 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%219, %220) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %208) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %212) : (i64, i64) -> () + %221 = llvm.mlir.constant(256 : i64) : i64 + %222 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%221, %222) : (i64, i64) -> () + %223 = llvm.mlir.constant(1488 : i64) : i64 + %224 = llvm.add %132, %223 : i64 + %225 = llvm.mlir.constant(64 : i64) : i64 + %226 = llvm.add %134, %225 : i64 + %227 = llvm.mlir.constant(16 : i64) : i64 + %228 = llvm.add %136, %227 : i64 + %229 = llvm.mlir.constant(736 : i64) : i64 + %230 = llvm.add %130, %229 : i64 + %231 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %232 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%231, %232) : (i64, i64) -> () + %233 = llvm.mlir.constant(844429225164800 : i64) : i64 + %234 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%233, %234) : (i64, i64) -> () + %235 = llvm.mlir.constant(844437817131008 : i64) : i64 + %236 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%235, %236) : (i64, i64) -> () + %237 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %238 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%237, %238) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%228, %224) : (i64, i64) -> () + 
"gemmini.intr.loop_conv_ws_config6"(%226, %230) : (i64, i64) -> () + %239 = llvm.mlir.constant(256 : i64) : i64 + %240 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%239, %240) : (i64, i64) -> () + %241 = llvm.mlir.constant(1504 : i64) : i64 + %242 = llvm.add %132, %241 : i64 + %243 = llvm.mlir.constant(128 : i64) : i64 + %244 = llvm.add %134, %243 : i64 + %245 = llvm.mlir.constant(32 : i64) : i64 + %246 = llvm.add %136, %245 : i64 + %247 = llvm.mlir.constant(736 : i64) : i64 + %248 = llvm.add %130, %247 : i64 + %249 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %250 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%249, %250) : (i64, i64) -> () + %251 = llvm.mlir.constant(844429225164800 : i64) : i64 + %252 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%251, %252) : (i64, i64) -> () + %253 = llvm.mlir.constant(844437817131008 : i64) : i64 + %254 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%253, %254) : (i64, i64) -> () + %255 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %256 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%255, %256) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%246, %242) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%244, %248) : (i64, i64) -> () + %257 = llvm.mlir.constant(256 : i64) : i64 + %258 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%257, %258) : (i64, i64) -> () + %259 = llvm.mlir.constant(1520 : i64) : i64 + %260 = llvm.add %132, %259 : i64 + %261 = llvm.mlir.constant(192 : i64) : i64 + %262 = llvm.add %134, %261 : i64 + %263 = llvm.mlir.constant(48 : i64) : i64 + %264 = llvm.add %136, %263 : i64 + %265 = llvm.mlir.constant(736 : i64) : i64 + %266 = llvm.add %130, %265 : i64 + %267 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %268 = llvm.mlir.constant(4296933406 : i64) : i64 + 
"gemmini.intr.loop_conv_ws_config1"(%267, %268) : (i64, i64) -> () + %269 = llvm.mlir.constant(844429225164800 : i64) : i64 + %270 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%269, %270) : (i64, i64) -> () + %271 = llvm.mlir.constant(844437817131008 : i64) : i64 + %272 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%271, %272) : (i64, i64) -> () + %273 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %274 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%273, %274) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%264, %260) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%262, %266) : (i64, i64) -> () + %275 = llvm.mlir.constant(256 : i64) : i64 + %276 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%275, %276) : (i64, i64) -> () + %277 = llvm.mlir.constant(42240 : i64) : i64 + %278 = llvm.add %132, %277 : i64 + %279 = llvm.mlir.constant(0 : i64) : i64 + %280 = llvm.mlir.constant(0 : i64) : i64 + %281 = llvm.mlir.constant(22528 : i64) : i64 + %282 = llvm.add %130, %281 : i64 + %283 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %284 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%283, %284) : (i64, i64) -> () + %285 = llvm.mlir.constant(844429225164800 : i64) : i64 + %286 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%285, %286) : (i64, i64) -> () + %287 = llvm.mlir.constant(844437817131008 : i64) : i64 + %288 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%287, %288) : (i64, i64) -> () + %289 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %290 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%289, %290) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %278) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %282) : (i64, i64) -> () + %291 = 
llvm.mlir.constant(256 : i64) : i64 + %292 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%291, %292) : (i64, i64) -> () + %293 = llvm.mlir.constant(42256 : i64) : i64 + %294 = llvm.add %132, %293 : i64 + %295 = llvm.mlir.constant(64 : i64) : i64 + %296 = llvm.add %134, %295 : i64 + %297 = llvm.mlir.constant(16 : i64) : i64 + %298 = llvm.add %136, %297 : i64 + %299 = llvm.mlir.constant(22528 : i64) : i64 + %300 = llvm.add %130, %299 : i64 + %301 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %302 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%301, %302) : (i64, i64) -> () + %303 = llvm.mlir.constant(844429225164800 : i64) : i64 + %304 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%303, %304) : (i64, i64) -> () + %305 = llvm.mlir.constant(844437817131008 : i64) : i64 + %306 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%305, %306) : (i64, i64) -> () + %307 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %308 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%307, %308) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%298, %294) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%296, %300) : (i64, i64) -> () + %309 = llvm.mlir.constant(256 : i64) : i64 + %310 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%309, %310) : (i64, i64) -> () + %311 = llvm.mlir.constant(42272 : i64) : i64 + %312 = llvm.add %132, %311 : i64 + %313 = llvm.mlir.constant(128 : i64) : i64 + %314 = llvm.add %134, %313 : i64 + %315 = llvm.mlir.constant(32 : i64) : i64 + %316 = llvm.add %136, %315 : i64 + %317 = llvm.mlir.constant(22528 : i64) : i64 + %318 = llvm.add %130, %317 : i64 + %319 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %320 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%319, %320) : (i64, i64) -> () + %321 = 
llvm.mlir.constant(844429225164800 : i64) : i64 + %322 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%321, %322) : (i64, i64) -> () + %323 = llvm.mlir.constant(844437817131008 : i64) : i64 + %324 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%323, %324) : (i64, i64) -> () + %325 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %326 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%325, %326) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%316, %312) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%314, %318) : (i64, i64) -> () + %327 = llvm.mlir.constant(256 : i64) : i64 + %328 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%327, %328) : (i64, i64) -> () + %329 = llvm.mlir.constant(42288 : i64) : i64 + %330 = llvm.add %132, %329 : i64 + %331 = llvm.mlir.constant(192 : i64) : i64 + %332 = llvm.add %134, %331 : i64 + %333 = llvm.mlir.constant(48 : i64) : i64 + %334 = llvm.add %136, %333 : i64 + %335 = llvm.mlir.constant(22528 : i64) : i64 + %336 = llvm.add %130, %335 : i64 + %337 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %338 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%337, %338) : (i64, i64) -> () + %339 = llvm.mlir.constant(844429225164800 : i64) : i64 + %340 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%339, %340) : (i64, i64) -> () + %341 = llvm.mlir.constant(844437817131008 : i64) : i64 + %342 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%341, %342) : (i64, i64) -> () + %343 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %344 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%343, %344) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%334, %330) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%332, %336) : (i64, i64) -> () + %345 = 
llvm.mlir.constant(256 : i64) : i64 + %346 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%345, %346) : (i64, i64) -> () + %347 = llvm.mlir.constant(43712 : i64) : i64 + %348 = llvm.add %132, %347 : i64 + %349 = llvm.mlir.constant(0 : i64) : i64 + %350 = llvm.mlir.constant(0 : i64) : i64 + %351 = llvm.mlir.constant(23264 : i64) : i64 + %352 = llvm.add %130, %351 : i64 + %353 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %354 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%353, %354) : (i64, i64) -> () + %355 = llvm.mlir.constant(844429225164800 : i64) : i64 + %356 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%355, %356) : (i64, i64) -> () + %357 = llvm.mlir.constant(844437817131008 : i64) : i64 + %358 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%357, %358) : (i64, i64) -> () + %359 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %360 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%359, %360) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %348) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %352) : (i64, i64) -> () + %361 = llvm.mlir.constant(256 : i64) : i64 + %362 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%361, %362) : (i64, i64) -> () + %363 = llvm.mlir.constant(43728 : i64) : i64 + %364 = llvm.add %132, %363 : i64 + %365 = llvm.mlir.constant(64 : i64) : i64 + %366 = llvm.add %134, %365 : i64 + %367 = llvm.mlir.constant(16 : i64) : i64 + %368 = llvm.add %136, %367 : i64 + %369 = llvm.mlir.constant(23264 : i64) : i64 + %370 = llvm.add %130, %369 : i64 + %371 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %372 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%371, %372) : (i64, i64) -> () + %373 = llvm.mlir.constant(844429225164800 : i64) : i64 + %374 = llvm.mlir.constant(281509336907792 : i64) : i64 
+ "gemmini.intr.loop_conv_ws_config2"(%373, %374) : (i64, i64) -> () + %375 = llvm.mlir.constant(844437817131008 : i64) : i64 + %376 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%375, %376) : (i64, i64) -> () + %377 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %378 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%377, %378) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%368, %364) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%366, %370) : (i64, i64) -> () + %379 = llvm.mlir.constant(256 : i64) : i64 + %380 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%379, %380) : (i64, i64) -> () + %381 = llvm.mlir.constant(43744 : i64) : i64 + %382 = llvm.add %132, %381 : i64 + %383 = llvm.mlir.constant(128 : i64) : i64 + %384 = llvm.add %134, %383 : i64 + %385 = llvm.mlir.constant(32 : i64) : i64 + %386 = llvm.add %136, %385 : i64 + %387 = llvm.mlir.constant(23264 : i64) : i64 + %388 = llvm.add %130, %387 : i64 + %389 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %390 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%389, %390) : (i64, i64) -> () + %391 = llvm.mlir.constant(844429225164800 : i64) : i64 + %392 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%391, %392) : (i64, i64) -> () + %393 = llvm.mlir.constant(844437817131008 : i64) : i64 + %394 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%393, %394) : (i64, i64) -> () + %395 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %396 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%395, %396) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%386, %382) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%384, %388) : (i64, i64) -> () + %397 = llvm.mlir.constant(256 : i64) : i64 + %398 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%397, %398) : 
(i64, i64) -> () + %399 = llvm.mlir.constant(43760 : i64) : i64 + %400 = llvm.add %132, %399 : i64 + %401 = llvm.mlir.constant(192 : i64) : i64 + %402 = llvm.add %134, %401 : i64 + %403 = llvm.mlir.constant(48 : i64) : i64 + %404 = llvm.add %136, %403 : i64 + %405 = llvm.mlir.constant(23264 : i64) : i64 + %406 = llvm.add %130, %405 : i64 + %407 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %408 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%407, %408) : (i64, i64) -> () + %409 = llvm.mlir.constant(844429225164800 : i64) : i64 + %410 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%409, %410) : (i64, i64) -> () + %411 = llvm.mlir.constant(844437817131008 : i64) : i64 + %412 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%411, %412) : (i64, i64) -> () + %413 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %414 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%413, %414) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%404, %400) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%402, %406) : (i64, i64) -> () + %415 = llvm.mlir.constant(256 : i64) : i64 + %416 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%415, %416) : (i64, i64) -> () + %417 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%417, %417) : (i64, i64) -> () + %418 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb13(%85 : i64) + ^bb13(%419: i64): // 2 preds: ^bb12, ^bb23 + %420 = llvm.icmp "slt" %419, %418 : i64 + llvm.cond_br %420, ^bb14, ^bb24 + ^bb14: // pred: ^bb13 + %421 = llvm.mlir.constant(30 : index) : i64 + llvm.br ^bb15(%85 : i64) + ^bb15(%422: i64): // 2 preds: ^bb14, ^bb22 + %423 = llvm.icmp "slt" %422, %421 : i64 + llvm.cond_br %423, ^bb16, ^bb23 + ^bb16: // pred: ^bb15 + %424 = llvm.mlir.constant(30 : index) : i64 + llvm.br ^bb17(%85 : i64) + ^bb17(%425: i64): // 2 preds: ^bb16, ^bb21 + %426 = 
llvm.icmp "slt" %425, %424 : i64 + llvm.cond_br %426, ^bb18, ^bb22 + ^bb18: // pred: ^bb17 + %427 = llvm.mlir.constant(64 : index) : i64 + llvm.br ^bb19(%85 : i64) + ^bb19(%428: i64): // 2 preds: ^bb18, ^bb20 + %429 = llvm.icmp "slt" %428, %427 : i64 + llvm.cond_br %429, ^bb20, ^bb21 + ^bb20: // pred: ^bb19 + %430 = llvm.mlir.constant(30 : index) : i64 + %431 = llvm.mul %419, %430 : i64 + %432 = llvm.mul %431, %430 : i64 + %433 = llvm.mul %422, %430 : i64 + %434 = llvm.add %432, %433 : i64 + %435 = llvm.add %434, %425 : i64 + %436 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %437 = llvm.mlir.constant(64 : index) : i64 + %438 = llvm.mul %435, %437 : i64 + %439 = llvm.add %438, %428 : i64 + %440 = llvm.getelementptr %436[%439] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %441 = llvm.load %440 : !llvm.ptr -> f32 + %442 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %443 = llvm.mlir.constant(57600 : index) : i64 + %444 = llvm.mul %419, %443 : i64 + %445 = llvm.mlir.constant(1920 : index) : i64 + %446 = llvm.mul %422, %445 : i64 + %447 = llvm.add %444, %446 : i64 + %448 = llvm.mlir.constant(64 : index) : i64 + %449 = llvm.mul %425, %448 : i64 + %450 = llvm.add %447, %449 : i64 + %451 = llvm.add %450, %428 : i64 + %452 = llvm.getelementptr %442[%451] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %441, %452 : f32, !llvm.ptr + %453 = llvm.add %428, %86 : i64 + llvm.br ^bb19(%453 : i64) + ^bb21: // pred: ^bb19 + %454 = llvm.add %425, %86 : i64 + llvm.br ^bb17(%454 : i64) + ^bb22: // pred: ^bb17 + %455 = llvm.add %422, %86 : i64 + llvm.br ^bb15(%455 : i64) + ^bb23: // pred: ^bb15 + %456 = llvm.add %419, %86 : i64 + llvm.br ^bb13(%456 : i64) + ^bb24: // pred: ^bb13 + %457 = llvm.extractvalue %52[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%457) : (!llvm.ptr) -> () + %458 = llvm.extractvalue %69[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, 
array<2 x i64>)> + llvm.call @free(%458) : (!llvm.ptr) -> () + %459 = llvm.extractvalue %82[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + llvm.call @free(%459) : (!llvm.ptr) -> () + llvm.return + } +} + + diff --git a/experiments/gemmini/networks/conv2d_block1.mlir b/experiments/gemmini/networks/conv2d_block1.mlir new file mode 100644 index 0000000..32a6ba7 --- /dev/null +++ b/experiments/gemmini/networks/conv2d_block1.mlir @@ -0,0 +1,13 @@ +module { + func.func @conv2d_block1( + %input: memref<1x32x32x32xf16>, // [N,H,W,C_in] + %filter: memref<3x3x32x64xf16>, // [KH,KW,C_in,C_out] + %output: memref<1x30x30x64xf32> // [N,H_out,W_out,C_out] + ) { + // linalg conv2d in NHWC x HWCF + linalg.conv_2d_nhwc_hwcf + ins(%input, %filter : memref<1x32x32x32xf16>, memref<3x3x32x64xf16>) + outs(%output : memref<1x30x30x64xf32>) + return + } +} From 860d350b12c0fba77b8a262d886ef975f3ad1251 Mon Sep 17 00:00:00 2001 From: sparsh Date: Thu, 18 Dec 2025 01:11:07 -0800 Subject: [PATCH 07/13] Buddy Gemmini: add NCHW conv2d block lowering test --- .../conv2d_block_nchw.print-after-all.mlir | 835 ++++++++++++++++++ .../gemmini/networks/conv2d_block_nchw.mlir | 17 + 2 files changed, 852 insertions(+) create mode 100644 experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir create mode 100644 experiments/gemmini/networks/conv2d_block_nchw.mlir diff --git a/experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir b/experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir new file mode 100644 index 0000000..fba1d9e --- /dev/null +++ b/experiments/gemmini/logs/conv2d_block_nchw.print-after-all.mlir @@ -0,0 +1,835 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + func.func @conv2d_block_nchw(%arg0: memref<1x3x32x32xf32>, %arg1: memref<64x3x3x3xf32>, %arg2: memref<1x64x30x30xf32>) { + %alloc = memref.alloc() : memref<1x32x32x3xf32> + %alloc_0 = memref.alloc() : 
memref<27x64xf32> + %alloc_1 = memref.alloc() : memref<64xi32> + %alloc_2 = memref.alloc() : memref<900x64xf32> + %c30_i64 = arith.constant 30 : i64 + %c3 = arith.constant 3 : index + %c3_3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_4 = arith.constant 1 : index + scf.for %arg3 = %c0 to %c1 step %c1_4 { + %c0_10 = arith.constant 0 : index + %c3_11 = arith.constant 3 : index + %c1_12 = arith.constant 1 : index + scf.for %arg4 = %c0_10 to %c3_11 step %c1_12 { + %c0_13 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1_14 = arith.constant 1 : index + scf.for %arg5 = %c0_13 to %c32 step %c1_14 { + %c0_15 = arith.constant 0 : index + %c32_16 = arith.constant 32 : index + %c1_17 = arith.constant 1 : index + scf.for %arg6 = %c0_15 to %c32_16 step %c1_17 { + %0 = memref.load %arg0[%arg3, %arg4, %arg5, %arg6] : memref<1x3x32x32xf32> + memref.store %0, %alloc[%arg3, %arg5, %arg6, %arg4] : memref<1x32x32x3xf32> + } + } + } + } + %c0_5 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1_6 = arith.constant 1 : index + scf.for %arg3 = %c0_5 to %c64 step %c1_6 { + %c0_10 = arith.constant 0 : index + %c3_11 = arith.constant 3 : index + %c1_12 = arith.constant 1 : index + scf.for %arg4 = %c0_10 to %c3_11 step %c1_12 { + %c0_13 = arith.constant 0 : index + %c3_14 = arith.constant 3 : index + %c1_15 = arith.constant 1 : index + scf.for %arg5 = %c0_13 to %c3_14 step %c1_15 { + %c0_16 = arith.constant 0 : index + %c3_17 = arith.constant 3 : index + %c1_18 = arith.constant 1 : index + scf.for %arg6 = %c0_16 to %c3_17 step %c1_18 { + %0 = arith.muli %arg5, %c3 : index + %1 = arith.muli %0, %c3_3 : index + %2 = arith.muli %arg6, %c3_3 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg4 : index + %5 = memref.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<64x3x3x3xf32> + memref.store %5, %alloc_0[%4, %arg3] : memref<27x64xf32> + } + } + } + } + %c3_i64 = arith.constant 3 : i64 + 
gemmini.tile_conv %alloc %alloc_0 %alloc_1 %alloc_2 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x64xf32> memref<64xi32> memref<900x64xf32> i64 i64 i64 + %c0_7 = arith.constant 0 : index + %c1_8 = arith.constant 1 : index + %c1_9 = arith.constant 1 : index + scf.for %arg3 = %c0_7 to %c1_8 step %c1_9 { + %c0_10 = arith.constant 0 : index + %c64_11 = arith.constant 64 : index + %c1_12 = arith.constant 1 : index + scf.for %arg4 = %c0_10 to %c64_11 step %c1_12 { + %c0_13 = arith.constant 0 : index + %c30 = arith.constant 30 : index + %c1_14 = arith.constant 1 : index + scf.for %arg5 = %c0_13 to %c30 step %c1_14 { + %c0_15 = arith.constant 0 : index + %c30_16 = arith.constant 30 : index + %c1_17 = arith.constant 1 : index + scf.for %arg6 = %c0_15 to %c30_16 step %c1_17 { + %c30_18 = arith.constant 30 : index + %0 = arith.muli %arg3, %c30_18 : index + %1 = arith.muli %0, %c30_18 : index + %2 = arith.muli %arg5, %c30_18 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg6 : index + %5 = memref.load %alloc_2[%4, %arg4] : memref<900x64xf32> + memref.store %5, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x64x30x30xf32> + } + } + } + } + memref.dealloc %alloc : memref<1x32x32x3xf32> + memref.dealloc %alloc_0 : memref<27x64xf32> + memref.dealloc %alloc_2 : memref<900x64xf32> + memref.dealloc %alloc_1 : memref<64xi32> + return + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.func @conv2d_block_nchw(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, 
%arg30: i64, %arg31: i64, %arg32: i64) { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %2 = llvm.insertvalue %arg23, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %3 = llvm.insertvalue %arg24, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %4 = llvm.insertvalue %arg25, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %5 = llvm.insertvalue %arg29, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %6 = llvm.insertvalue %arg26, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %7 = llvm.insertvalue %arg30, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %8 = llvm.insertvalue %arg27, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %9 = llvm.insertvalue %arg31, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %10 = llvm.insertvalue %arg28, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %11 = llvm.insertvalue %arg32, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %13 = llvm.insertvalue %arg11, %12[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %14 = llvm.insertvalue %arg12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %15 = llvm.insertvalue %arg13, %14[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %16 = llvm.insertvalue %arg14, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.insertvalue %arg18, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg15, %17[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + 
%19 = llvm.insertvalue %arg19, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg16, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg20, %20[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg17, %21[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg21, %22[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.insertvalue %arg0, %24[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %26 = llvm.insertvalue %arg1, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %27 = llvm.insertvalue %arg2, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %28 = llvm.insertvalue %arg3, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %29 = llvm.insertvalue %arg7, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %30 = llvm.insertvalue %arg4, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %31 = llvm.insertvalue %arg8, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %32 = llvm.insertvalue %arg5, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %33 = llvm.insertvalue %arg9, %32[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.insertvalue %arg6, %33[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.insertvalue %arg10, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1 : index) : i64 + %37 = llvm.mlir.constant(32 : index) : i64 + %38 = llvm.mlir.constant(32 : index) : i64 + %39 = llvm.mlir.constant(3 : index) : i64 + %40 = llvm.mlir.constant(1 : index) : 
i64 + %41 = llvm.mlir.constant(96 : index) : i64 + %42 = llvm.mlir.constant(3072 : index) : i64 + %43 = llvm.mlir.constant(3072 : index) : i64 + %44 = llvm.mlir.zero : !llvm.ptr + %45 = llvm.getelementptr %44[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %46 = llvm.ptrtoint %45 : !llvm.ptr to i64 + %47 = llvm.call @malloc(%46) : (i64) -> !llvm.ptr + %48 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %47, %48[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %47, %49[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(0 : index) : i64 + %52 = llvm.insertvalue %51, %50[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %36, %52[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %37, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %38, %54[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %39, %55[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %42, %56[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %41, %57[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.insertvalue %39, %58[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %60 = llvm.insertvalue %40, %59[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.mlir.constant(27 : index) : i64 + %62 = llvm.mlir.constant(64 : index) : i64 + %63 = llvm.mlir.constant(1 : index) : i64 + %64 = llvm.mlir.constant(1728 : index) : i64 + %65 = llvm.mlir.zero : !llvm.ptr + %66 = llvm.getelementptr %65[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %67 = llvm.ptrtoint %66 : !llvm.ptr to i64 + %68 = llvm.call @malloc(%67) : (i64) -> 
!llvm.ptr + %69 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %70 = llvm.insertvalue %68, %69[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %71 = llvm.insertvalue %68, %70[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %72 = llvm.mlir.constant(0 : index) : i64 + %73 = llvm.insertvalue %72, %71[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %74 = llvm.insertvalue %61, %73[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %75 = llvm.insertvalue %62, %74[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %76 = llvm.insertvalue %62, %75[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %77 = llvm.insertvalue %63, %76[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %78 = llvm.mlir.constant(64 : index) : i64 + %79 = llvm.mlir.constant(1 : index) : i64 + %80 = llvm.mlir.zero : !llvm.ptr + %81 = llvm.getelementptr %80[%78] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %82 = llvm.ptrtoint %81 : !llvm.ptr to i64 + %83 = llvm.call @malloc(%82) : (i64) -> !llvm.ptr + %84 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %85 = llvm.insertvalue %83, %84[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %86 = llvm.insertvalue %83, %85[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %87 = llvm.mlir.constant(0 : index) : i64 + %88 = llvm.insertvalue %87, %86[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %89 = llvm.insertvalue %78, %88[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %90 = llvm.insertvalue %79, %89[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %91 = llvm.mlir.constant(900 : index) : i64 + %92 = llvm.mlir.constant(64 : index) : i64 + %93 = llvm.mlir.constant(1 : index) : i64 + %94 = llvm.mlir.constant(57600 : index) : i64 + %95 = 
llvm.mlir.zero : !llvm.ptr + %96 = llvm.getelementptr %95[%94] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %97 = llvm.ptrtoint %96 : !llvm.ptr to i64 + %98 = llvm.call @malloc(%97) : (i64) -> !llvm.ptr + %99 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %100 = llvm.insertvalue %98, %99[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %101 = llvm.insertvalue %98, %100[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %102 = llvm.mlir.constant(0 : index) : i64 + %103 = llvm.insertvalue %102, %101[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %104 = llvm.insertvalue %91, %103[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %105 = llvm.insertvalue %92, %104[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %106 = llvm.insertvalue %92, %105[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %107 = llvm.insertvalue %93, %106[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %108 = llvm.mlir.constant(30 : i64) : i64 + %109 = llvm.mlir.constant(3 : index) : i64 + %110 = llvm.mlir.constant(3 : index) : i64 + %111 = llvm.mlir.constant(0 : index) : i64 + %112 = llvm.mlir.constant(1 : index) : i64 + %113 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%111 : i64) + ^bb1(%114: i64): // 2 preds: ^bb0, ^bb11 + %115 = llvm.icmp "slt" %114, %112 : i64 + llvm.cond_br %115, ^bb2, ^bb12 + ^bb2: // pred: ^bb1 + %116 = llvm.mlir.constant(0 : index) : i64 + %117 = llvm.mlir.constant(3 : index) : i64 + %118 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb3(%116 : i64) + ^bb3(%119: i64): // 2 preds: ^bb2, ^bb10 + %120 = llvm.icmp "slt" %119, %117 : i64 + llvm.cond_br %120, ^bb4, ^bb11 + ^bb4: // pred: ^bb3 + %121 = llvm.mlir.constant(0 : index) : i64 + %122 = llvm.mlir.constant(32 : index) : i64 + %123 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb5(%121 : i64) + ^bb5(%124: i64): // 2 preds: 
^bb4, ^bb9 + %125 = llvm.icmp "slt" %124, %122 : i64 + llvm.cond_br %125, ^bb6, ^bb10 + ^bb6: // pred: ^bb5 + %126 = llvm.mlir.constant(0 : index) : i64 + %127 = llvm.mlir.constant(32 : index) : i64 + %128 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb7(%126 : i64) + ^bb7(%129: i64): // 2 preds: ^bb6, ^bb8 + %130 = llvm.icmp "slt" %129, %127 : i64 + llvm.cond_br %130, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %131 = llvm.extractvalue %35[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %132 = llvm.mlir.constant(3072 : index) : i64 + %133 = llvm.mul %114, %132 : i64 + %134 = llvm.mlir.constant(1024 : index) : i64 + %135 = llvm.mul %119, %134 : i64 + %136 = llvm.add %133, %135 : i64 + %137 = llvm.mlir.constant(32 : index) : i64 + %138 = llvm.mul %124, %137 : i64 + %139 = llvm.add %136, %138 : i64 + %140 = llvm.add %139, %129 : i64 + %141 = llvm.getelementptr %131[%140] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %142 = llvm.load %141 : !llvm.ptr -> f32 + %143 = llvm.extractvalue %60[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %144 = llvm.mlir.constant(3072 : index) : i64 + %145 = llvm.mul %114, %144 : i64 + %146 = llvm.mlir.constant(96 : index) : i64 + %147 = llvm.mul %124, %146 : i64 + %148 = llvm.add %145, %147 : i64 + %149 = llvm.mlir.constant(3 : index) : i64 + %150 = llvm.mul %129, %149 : i64 + %151 = llvm.add %148, %150 : i64 + %152 = llvm.add %151, %119 : i64 + %153 = llvm.getelementptr %143[%152] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %142, %153 : f32, !llvm.ptr + %154 = llvm.add %129, %128 : i64 + llvm.br ^bb7(%154 : i64) + ^bb9: // pred: ^bb7 + %155 = llvm.add %124, %123 : i64 + llvm.br ^bb5(%155 : i64) + ^bb10: // pred: ^bb5 + %156 = llvm.add %119, %118 : i64 + llvm.br ^bb3(%156 : i64) + ^bb11: // pred: ^bb3 + %157 = llvm.add %114, %113 : i64 + llvm.br ^bb1(%157 : i64) + ^bb12: // pred: ^bb1 + %158 = llvm.mlir.constant(0 : index) : i64 + %159 = llvm.mlir.constant(64 : index) : i64 + %160 = 
llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb13(%158 : i64) + ^bb13(%161: i64): // 2 preds: ^bb12, ^bb23 + %162 = llvm.icmp "slt" %161, %159 : i64 + llvm.cond_br %162, ^bb14, ^bb24 + ^bb14: // pred: ^bb13 + %163 = llvm.mlir.constant(0 : index) : i64 + %164 = llvm.mlir.constant(3 : index) : i64 + %165 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb15(%163 : i64) + ^bb15(%166: i64): // 2 preds: ^bb14, ^bb22 + %167 = llvm.icmp "slt" %166, %164 : i64 + llvm.cond_br %167, ^bb16, ^bb23 + ^bb16: // pred: ^bb15 + %168 = llvm.mlir.constant(0 : index) : i64 + %169 = llvm.mlir.constant(3 : index) : i64 + %170 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb17(%168 : i64) + ^bb17(%171: i64): // 2 preds: ^bb16, ^bb21 + %172 = llvm.icmp "slt" %171, %169 : i64 + llvm.cond_br %172, ^bb18, ^bb22 + ^bb18: // pred: ^bb17 + %173 = llvm.mlir.constant(0 : index) : i64 + %174 = llvm.mlir.constant(3 : index) : i64 + %175 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb19(%173 : i64) + ^bb19(%176: i64): // 2 preds: ^bb18, ^bb20 + %177 = llvm.icmp "slt" %176, %174 : i64 + llvm.cond_br %177, ^bb20, ^bb21 + ^bb20: // pred: ^bb19 + %178 = llvm.mul %171, %109 : i64 + %179 = llvm.mul %178, %110 : i64 + %180 = llvm.mul %176, %110 : i64 + %181 = llvm.add %179, %180 : i64 + %182 = llvm.add %181, %166 : i64 + %183 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %184 = llvm.mlir.constant(27 : index) : i64 + %185 = llvm.mul %161, %184 : i64 + %186 = llvm.mlir.constant(9 : index) : i64 + %187 = llvm.mul %166, %186 : i64 + %188 = llvm.add %185, %187 : i64 + %189 = llvm.mlir.constant(3 : index) : i64 + %190 = llvm.mul %171, %189 : i64 + %191 = llvm.add %188, %190 : i64 + %192 = llvm.add %191, %176 : i64 + %193 = llvm.getelementptr %183[%192] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %194 = llvm.load %193 : !llvm.ptr -> f32 + %195 = llvm.extractvalue %77[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %196 = llvm.mlir.constant(64 
: index) : i64 + %197 = llvm.mul %182, %196 : i64 + %198 = llvm.add %197, %161 : i64 + %199 = llvm.getelementptr %195[%198] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %194, %199 : f32, !llvm.ptr + %200 = llvm.add %176, %175 : i64 + llvm.br ^bb19(%200 : i64) + ^bb21: // pred: ^bb19 + %201 = llvm.add %171, %170 : i64 + llvm.br ^bb17(%201 : i64) + ^bb22: // pred: ^bb17 + %202 = llvm.add %166, %165 : i64 + llvm.br ^bb15(%202 : i64) + ^bb23: // pred: ^bb15 + %203 = llvm.add %161, %160 : i64 + llvm.br ^bb13(%203 : i64) + ^bb24: // pred: ^bb13 + %204 = llvm.mlir.constant(3 : i64) : i64 + %205 = llvm.extractvalue %60[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %206 = llvm.ptrtoint %205 : !llvm.ptr to i64 + %207 = llvm.extractvalue %107[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %208 = llvm.ptrtoint %207 : !llvm.ptr to i64 + %209 = llvm.extractvalue %90[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %210 = llvm.ptrtoint %209 : !llvm.ptr to i64 + %211 = llvm.extractvalue %77[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %212 = llvm.ptrtoint %211 : !llvm.ptr to i64 + %213 = llvm.mlir.constant(64 : i64) : i64 + %214 = llvm.mlir.constant(2 : i64) : i64 + %215 = llvm.mlir.constant(4575657221408424000 : i64) : i64 + "gemmini.intr.config_st"(%214, %215) : (i64, i64) -> () + %216 = llvm.mlir.constant(65540 : i64) : i64 + %217 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%216, %217) : (i64, i64) -> () + %218 = llvm.mlir.constant(0 : i64) : i64 + %219 = llvm.mlir.constant(0 : i64) : i64 + %220 = llvm.mlir.constant(0 : i64) : i64 + %221 = llvm.mlir.constant(0 : i64) : i64 + %222 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %223 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%222, %223) : (i64, i64) -> () + %224 = llvm.mlir.constant(844429225164800 : i64) : i64 + %225 = llvm.mlir.constant(281569467498512 : 
i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%224, %225) : (i64, i64) -> () + %226 = llvm.mlir.constant(844437815230464 : i64) : i64 + %227 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%226, %227) : (i64, i64) -> () + %228 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %229 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%228, %229) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%212, %208) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%210, %206) : (i64, i64) -> () + %230 = llvm.mlir.constant(768 : i64) : i64 + %231 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%230, %231) : (i64, i64) -> () + %232 = llvm.mlir.constant(16 : i64) : i64 + %233 = llvm.add %208, %232 : i64 + %234 = llvm.mlir.constant(64 : i64) : i64 + %235 = llvm.add %210, %234 : i64 + %236 = llvm.mlir.constant(16 : i64) : i64 + %237 = llvm.add %212, %236 : i64 + %238 = llvm.mlir.constant(0 : i64) : i64 + %239 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %240 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%239, %240) : (i64, i64) -> () + %241 = llvm.mlir.constant(844429225164800 : i64) : i64 + %242 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%241, %242) : (i64, i64) -> () + %243 = llvm.mlir.constant(844437815230464 : i64) : i64 + %244 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%243, %244) : (i64, i64) -> () + %245 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %246 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%245, %246) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%237, %233) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%235, %206) : (i64, i64) -> () + %247 = llvm.mlir.constant(768 : i64) : i64 + %248 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%247, %248) : (i64, i64) -> () + %249 = 
llvm.mlir.constant(32 : i64) : i64 + %250 = llvm.add %208, %249 : i64 + %251 = llvm.mlir.constant(128 : i64) : i64 + %252 = llvm.add %210, %251 : i64 + %253 = llvm.mlir.constant(32 : i64) : i64 + %254 = llvm.add %212, %253 : i64 + %255 = llvm.mlir.constant(0 : i64) : i64 + %256 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %257 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%256, %257) : (i64, i64) -> () + %258 = llvm.mlir.constant(844429225164800 : i64) : i64 + %259 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%258, %259) : (i64, i64) -> () + %260 = llvm.mlir.constant(844437815230464 : i64) : i64 + %261 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%260, %261) : (i64, i64) -> () + %262 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %263 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%262, %263) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%254, %250) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%252, %206) : (i64, i64) -> () + %264 = llvm.mlir.constant(768 : i64) : i64 + %265 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%264, %265) : (i64, i64) -> () + %266 = llvm.mlir.constant(48 : i64) : i64 + %267 = llvm.add %208, %266 : i64 + %268 = llvm.mlir.constant(192 : i64) : i64 + %269 = llvm.add %210, %268 : i64 + %270 = llvm.mlir.constant(48 : i64) : i64 + %271 = llvm.add %212, %270 : i64 + %272 = llvm.mlir.constant(0 : i64) : i64 + %273 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %274 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%273, %274) : (i64, i64) -> () + %275 = llvm.mlir.constant(844429225164800 : i64) : i64 + %276 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%275, %276) : (i64, i64) -> () + %277 = llvm.mlir.constant(844437815230464 : i64) : i64 + %278 = llvm.mlir.constant(0 : 
i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%277, %278) : (i64, i64) -> () + %279 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %280 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%279, %280) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%271, %267) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%269, %206) : (i64, i64) -> () + %281 = llvm.mlir.constant(768 : i64) : i64 + %282 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%281, %282) : (i64, i64) -> () + %283 = llvm.mlir.constant(1472 : i64) : i64 + %284 = llvm.add %208, %283 : i64 + %285 = llvm.mlir.constant(0 : i64) : i64 + %286 = llvm.mlir.constant(0 : i64) : i64 + %287 = llvm.mlir.constant(69 : i64) : i64 + %288 = llvm.add %206, %287 : i64 + %289 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %290 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%289, %290) : (i64, i64) -> () + %291 = llvm.mlir.constant(844429225164800 : i64) : i64 + %292 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%291, %292) : (i64, i64) -> () + %293 = llvm.mlir.constant(844437815230464 : i64) : i64 + %294 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%293, %294) : (i64, i64) -> () + %295 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %296 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%295, %296) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%212, %284) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%210, %288) : (i64, i64) -> () + %297 = llvm.mlir.constant(768 : i64) : i64 + %298 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%297, %298) : (i64, i64) -> () + %299 = llvm.mlir.constant(1488 : i64) : i64 + %300 = llvm.add %208, %299 : i64 + %301 = llvm.mlir.constant(64 : i64) : i64 + %302 = llvm.add %210, %301 : i64 + %303 = llvm.mlir.constant(16 : i64) : i64 + %304 = 
llvm.add %212, %303 : i64 + %305 = llvm.mlir.constant(69 : i64) : i64 + %306 = llvm.add %206, %305 : i64 + %307 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %308 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%307, %308) : (i64, i64) -> () + %309 = llvm.mlir.constant(844429225164800 : i64) : i64 + %310 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%309, %310) : (i64, i64) -> () + %311 = llvm.mlir.constant(844437815230464 : i64) : i64 + %312 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%311, %312) : (i64, i64) -> () + %313 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %314 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%313, %314) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%304, %300) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%302, %306) : (i64, i64) -> () + %315 = llvm.mlir.constant(768 : i64) : i64 + %316 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%315, %316) : (i64, i64) -> () + %317 = llvm.mlir.constant(1504 : i64) : i64 + %318 = llvm.add %208, %317 : i64 + %319 = llvm.mlir.constant(128 : i64) : i64 + %320 = llvm.add %210, %319 : i64 + %321 = llvm.mlir.constant(32 : i64) : i64 + %322 = llvm.add %212, %321 : i64 + %323 = llvm.mlir.constant(69 : i64) : i64 + %324 = llvm.add %206, %323 : i64 + %325 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %326 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%325, %326) : (i64, i64) -> () + %327 = llvm.mlir.constant(844429225164800 : i64) : i64 + %328 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%327, %328) : (i64, i64) -> () + %329 = llvm.mlir.constant(844437815230464 : i64) : i64 + %330 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%329, %330) : (i64, i64) -> () + %331 = llvm.mlir.constant(6192449487634432 : 
i64) : i64 + %332 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%331, %332) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%322, %318) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%320, %324) : (i64, i64) -> () + %333 = llvm.mlir.constant(768 : i64) : i64 + %334 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%333, %334) : (i64, i64) -> () + %335 = llvm.mlir.constant(1520 : i64) : i64 + %336 = llvm.add %208, %335 : i64 + %337 = llvm.mlir.constant(192 : i64) : i64 + %338 = llvm.add %210, %337 : i64 + %339 = llvm.mlir.constant(48 : i64) : i64 + %340 = llvm.add %212, %339 : i64 + %341 = llvm.mlir.constant(69 : i64) : i64 + %342 = llvm.add %206, %341 : i64 + %343 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %344 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%343, %344) : (i64, i64) -> () + %345 = llvm.mlir.constant(844429225164800 : i64) : i64 + %346 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%345, %346) : (i64, i64) -> () + %347 = llvm.mlir.constant(844437815230464 : i64) : i64 + %348 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%347, %348) : (i64, i64) -> () + %349 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %350 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%349, %350) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%340, %336) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%338, %342) : (i64, i64) -> () + %351 = llvm.mlir.constant(768 : i64) : i64 + %352 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%351, %352) : (i64, i64) -> () + %353 = llvm.mlir.constant(42240 : i64) : i64 + %354 = llvm.add %208, %353 : i64 + %355 = llvm.mlir.constant(0 : i64) : i64 + %356 = llvm.mlir.constant(0 : i64) : i64 + %357 = llvm.mlir.constant(2112 : i64) : i64 + %358 = llvm.add %206, %357 : i64 + %359 = 
llvm.mlir.constant(18014411396481025 : i64) : i64 + %360 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%359, %360) : (i64, i64) -> () + %361 = llvm.mlir.constant(844429225164800 : i64) : i64 + %362 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%361, %362) : (i64, i64) -> () + %363 = llvm.mlir.constant(844437815230464 : i64) : i64 + %364 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%363, %364) : (i64, i64) -> () + %365 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %366 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%365, %366) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%212, %354) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%210, %358) : (i64, i64) -> () + %367 = llvm.mlir.constant(768 : i64) : i64 + %368 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%367, %368) : (i64, i64) -> () + %369 = llvm.mlir.constant(42256 : i64) : i64 + %370 = llvm.add %208, %369 : i64 + %371 = llvm.mlir.constant(64 : i64) : i64 + %372 = llvm.add %210, %371 : i64 + %373 = llvm.mlir.constant(16 : i64) : i64 + %374 = llvm.add %212, %373 : i64 + %375 = llvm.mlir.constant(2112 : i64) : i64 + %376 = llvm.add %206, %375 : i64 + %377 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %378 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%377, %378) : (i64, i64) -> () + %379 = llvm.mlir.constant(844429225164800 : i64) : i64 + %380 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%379, %380) : (i64, i64) -> () + %381 = llvm.mlir.constant(844437815230464 : i64) : i64 + %382 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%381, %382) : (i64, i64) -> () + %383 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %384 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%383, %384) : (i64, 
i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%374, %370) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%372, %376) : (i64, i64) -> () + %385 = llvm.mlir.constant(768 : i64) : i64 + %386 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%385, %386) : (i64, i64) -> () + %387 = llvm.mlir.constant(42272 : i64) : i64 + %388 = llvm.add %208, %387 : i64 + %389 = llvm.mlir.constant(128 : i64) : i64 + %390 = llvm.add %210, %389 : i64 + %391 = llvm.mlir.constant(32 : i64) : i64 + %392 = llvm.add %212, %391 : i64 + %393 = llvm.mlir.constant(2112 : i64) : i64 + %394 = llvm.add %206, %393 : i64 + %395 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %396 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%395, %396) : (i64, i64) -> () + %397 = llvm.mlir.constant(844429225164800 : i64) : i64 + %398 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%397, %398) : (i64, i64) -> () + %399 = llvm.mlir.constant(844437815230464 : i64) : i64 + %400 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%399, %400) : (i64, i64) -> () + %401 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %402 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%401, %402) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%392, %388) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%390, %394) : (i64, i64) -> () + %403 = llvm.mlir.constant(768 : i64) : i64 + %404 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%403, %404) : (i64, i64) -> () + %405 = llvm.mlir.constant(42288 : i64) : i64 + %406 = llvm.add %208, %405 : i64 + %407 = llvm.mlir.constant(192 : i64) : i64 + %408 = llvm.add %210, %407 : i64 + %409 = llvm.mlir.constant(48 : i64) : i64 + %410 = llvm.add %212, %409 : i64 + %411 = llvm.mlir.constant(2112 : i64) : i64 + %412 = llvm.add %206, %411 : i64 + %413 = llvm.mlir.constant(18014411396481025 : i64) : i64 + 
%414 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%413, %414) : (i64, i64) -> () + %415 = llvm.mlir.constant(844429225164800 : i64) : i64 + %416 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%415, %416) : (i64, i64) -> () + %417 = llvm.mlir.constant(844437815230464 : i64) : i64 + %418 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%417, %418) : (i64, i64) -> () + %419 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %420 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%419, %420) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%410, %406) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%408, %412) : (i64, i64) -> () + %421 = llvm.mlir.constant(768 : i64) : i64 + %422 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%421, %422) : (i64, i64) -> () + %423 = llvm.mlir.constant(43712 : i64) : i64 + %424 = llvm.add %208, %423 : i64 + %425 = llvm.mlir.constant(0 : i64) : i64 + %426 = llvm.mlir.constant(0 : i64) : i64 + %427 = llvm.mlir.constant(2181 : i64) : i64 + %428 = llvm.add %206, %427 : i64 + %429 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %430 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%429, %430) : (i64, i64) -> () + %431 = llvm.mlir.constant(844429225164800 : i64) : i64 + %432 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%431, %432) : (i64, i64) -> () + %433 = llvm.mlir.constant(844437815230464 : i64) : i64 + %434 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%433, %434) : (i64, i64) -> () + %435 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %436 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%435, %436) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%212, %424) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%210, 
%428) : (i64, i64) -> () + %437 = llvm.mlir.constant(768 : i64) : i64 + %438 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%437, %438) : (i64, i64) -> () + %439 = llvm.mlir.constant(43728 : i64) : i64 + %440 = llvm.add %208, %439 : i64 + %441 = llvm.mlir.constant(64 : i64) : i64 + %442 = llvm.add %210, %441 : i64 + %443 = llvm.mlir.constant(16 : i64) : i64 + %444 = llvm.add %212, %443 : i64 + %445 = llvm.mlir.constant(2181 : i64) : i64 + %446 = llvm.add %206, %445 : i64 + %447 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %448 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%447, %448) : (i64, i64) -> () + %449 = llvm.mlir.constant(844429225164800 : i64) : i64 + %450 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%449, %450) : (i64, i64) -> () + %451 = llvm.mlir.constant(844437815230464 : i64) : i64 + %452 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%451, %452) : (i64, i64) -> () + %453 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %454 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%453, %454) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%444, %440) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%442, %446) : (i64, i64) -> () + %455 = llvm.mlir.constant(768 : i64) : i64 + %456 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%455, %456) : (i64, i64) -> () + %457 = llvm.mlir.constant(43744 : i64) : i64 + %458 = llvm.add %208, %457 : i64 + %459 = llvm.mlir.constant(128 : i64) : i64 + %460 = llvm.add %210, %459 : i64 + %461 = llvm.mlir.constant(32 : i64) : i64 + %462 = llvm.add %212, %461 : i64 + %463 = llvm.mlir.constant(2181 : i64) : i64 + %464 = llvm.add %206, %463 : i64 + %465 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %466 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%465, %466) : (i64, i64) -> () + %467 = 
llvm.mlir.constant(844429225164800 : i64) : i64 + %468 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%467, %468) : (i64, i64) -> () + %469 = llvm.mlir.constant(844437815230464 : i64) : i64 + %470 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%469, %470) : (i64, i64) -> () + %471 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %472 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%471, %472) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%462, %458) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%460, %464) : (i64, i64) -> () + %473 = llvm.mlir.constant(768 : i64) : i64 + %474 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%473, %474) : (i64, i64) -> () + %475 = llvm.mlir.constant(43760 : i64) : i64 + %476 = llvm.add %208, %475 : i64 + %477 = llvm.mlir.constant(192 : i64) : i64 + %478 = llvm.add %210, %477 : i64 + %479 = llvm.mlir.constant(48 : i64) : i64 + %480 = llvm.add %212, %479 : i64 + %481 = llvm.mlir.constant(2181 : i64) : i64 + %482 = llvm.add %206, %481 : i64 + %483 = llvm.mlir.constant(18014411396481025 : i64) : i64 + %484 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%483, %484) : (i64, i64) -> () + %485 = llvm.mlir.constant(844429225164800 : i64) : i64 + %486 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%485, %486) : (i64, i64) -> () + %487 = llvm.mlir.constant(844437815230464 : i64) : i64 + %488 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%487, %488) : (i64, i64) -> () + %489 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %490 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%489, %490) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%480, %476) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%478, %482) : (i64, i64) -> () + %491 = 
llvm.mlir.constant(768 : i64) : i64 + %492 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%491, %492) : (i64, i64) -> () + %493 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%493, %493) : (i64, i64) -> () + %494 = llvm.mlir.constant(0 : index) : i64 + %495 = llvm.mlir.constant(1 : index) : i64 + %496 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb25(%494 : i64) + ^bb25(%497: i64): // 2 preds: ^bb24, ^bb35 + %498 = llvm.icmp "slt" %497, %495 : i64 + llvm.cond_br %498, ^bb26, ^bb36 + ^bb26: // pred: ^bb25 + %499 = llvm.mlir.constant(0 : index) : i64 + %500 = llvm.mlir.constant(64 : index) : i64 + %501 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb27(%499 : i64) + ^bb27(%502: i64): // 2 preds: ^bb26, ^bb34 + %503 = llvm.icmp "slt" %502, %500 : i64 + llvm.cond_br %503, ^bb28, ^bb35 + ^bb28: // pred: ^bb27 + %504 = llvm.mlir.constant(0 : index) : i64 + %505 = llvm.mlir.constant(30 : index) : i64 + %506 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb29(%504 : i64) + ^bb29(%507: i64): // 2 preds: ^bb28, ^bb33 + %508 = llvm.icmp "slt" %507, %505 : i64 + llvm.cond_br %508, ^bb30, ^bb34 + ^bb30: // pred: ^bb29 + %509 = llvm.mlir.constant(0 : index) : i64 + %510 = llvm.mlir.constant(30 : index) : i64 + %511 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb31(%509 : i64) + ^bb31(%512: i64): // 2 preds: ^bb30, ^bb32 + %513 = llvm.icmp "slt" %512, %510 : i64 + llvm.cond_br %513, ^bb32, ^bb33 + ^bb32: // pred: ^bb31 + %514 = llvm.mlir.constant(30 : index) : i64 + %515 = llvm.mul %497, %514 : i64 + %516 = llvm.mul %515, %514 : i64 + %517 = llvm.mul %507, %514 : i64 + %518 = llvm.add %516, %517 : i64 + %519 = llvm.add %518, %512 : i64 + %520 = llvm.extractvalue %107[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %521 = llvm.mlir.constant(64 : index) : i64 + %522 = llvm.mul %519, %521 : i64 + %523 = llvm.add %522, %502 : i64 + %524 = llvm.getelementptr %520[%523] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %525 = 
llvm.load %524 : !llvm.ptr -> f32 + %526 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %527 = llvm.mlir.constant(57600 : index) : i64 + %528 = llvm.mul %497, %527 : i64 + %529 = llvm.mlir.constant(900 : index) : i64 + %530 = llvm.mul %502, %529 : i64 + %531 = llvm.add %528, %530 : i64 + %532 = llvm.mlir.constant(30 : index) : i64 + %533 = llvm.mul %507, %532 : i64 + %534 = llvm.add %531, %533 : i64 + %535 = llvm.add %534, %512 : i64 + %536 = llvm.getelementptr %526[%535] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %525, %536 : f32, !llvm.ptr + %537 = llvm.add %512, %511 : i64 + llvm.br ^bb31(%537 : i64) + ^bb33: // pred: ^bb31 + %538 = llvm.add %507, %506 : i64 + llvm.br ^bb29(%538 : i64) + ^bb34: // pred: ^bb29 + %539 = llvm.add %502, %501 : i64 + llvm.br ^bb27(%539 : i64) + ^bb35: // pred: ^bb27 + %540 = llvm.add %497, %496 : i64 + llvm.br ^bb25(%540 : i64) + ^bb36: // pred: ^bb25 + %541 = llvm.extractvalue %60[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + llvm.call @free(%541) : (!llvm.ptr) -> () + %542 = llvm.extractvalue %77[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%542) : (!llvm.ptr) -> () + %543 = llvm.extractvalue %107[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%543) : (!llvm.ptr) -> () + %544 = llvm.extractvalue %90[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + llvm.call @free(%544) : (!llvm.ptr) -> () + llvm.return + } +} + + diff --git a/experiments/gemmini/networks/conv2d_block_nchw.mlir b/experiments/gemmini/networks/conv2d_block_nchw.mlir new file mode 100644 index 0000000..2df3f75 --- /dev/null +++ b/experiments/gemmini/networks/conv2d_block_nchw.mlir @@ -0,0 +1,17 @@ +module { + // NCHW input: [N, C, H, W] = [1, 3, 32, 32] + // FCHW filter: [F, C, KH, KW] = [64, 3, 3, 3] + // NCHW output: [1, 64, 30, 30] (no padding, stride 1) + func.func @conv2d_block_nchw( + %input: 
memref<1x3x32x32xf32>, + %filter: memref<64x3x3x3xf32>, + %output: memref<1x64x30x30xf32> + ) { + linalg.conv_2d_nchw_fchw + ins(%input, %filter : + memref<1x3x32x32xf32>, memref<64x3x3x3xf32>) + outs(%output : + memref<1x64x30x30xf32>) + return + } +} From 4c3e9714526f64065ac065cec75ad9f966fd838c Mon Sep 17 00:00:00 2001 From: sparsh Date: Wed, 7 Jan 2026 17:49:18 -0800 Subject: [PATCH 08/13] Buddy Gemmini: add mini CNN conv block lowering test --- .../logs/mini_cnn_block.print-after-all.mlir | 1195 +++++++++++++++++ .../gemmini/networks/mini_cnn_block.mlir | 34 + 2 files changed, 1229 insertions(+) create mode 100644 experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir create mode 100644 experiments/gemmini/networks/mini_cnn_block.mlir diff --git a/experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir b/experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir new file mode 100644 index 0000000..0956689 --- /dev/null +++ b/experiments/gemmini/logs/mini_cnn_block.print-after-all.mlir @@ -0,0 +1,1195 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + func.func @mini_cnn_block(%arg0: memref<1x3x32x32xf32>, %arg1: memref<16x3x3x3xf32>, %arg2: memref<32x16x3x3xf32>, %arg3: memref<1x32x26x26xf32>) { + %alloc = memref.alloc() : memref<1x16x30x30xf32> + %alloc_0 = memref.alloc() : memref<1x32x26x26xf32> + %alloc_1 = memref.alloc() : memref<1x32x32x3xf32> + %alloc_2 = memref.alloc() : memref<27x16xf32> + %alloc_3 = memref.alloc() : memref<16xi32> + %alloc_4 = memref.alloc() : memref<900x16xf32> + %c30_i64 = arith.constant 30 : i64 + %c3 = arith.constant 3 : index + %c3_5 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_6 = arith.constant 1 : index + scf.for %arg4 = %c0 to %c1 step %c1_6 { + %c0_27 = arith.constant 0 : index + %c3_28 = arith.constant 3 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c3_28 
step %c1_29 { + %c0_30 = arith.constant 0 : index + %c32_31 = arith.constant 32 : index + %c1_32 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c32_31 step %c1_32 { + %c0_33 = arith.constant 0 : index + %c32_34 = arith.constant 32 : index + %c1_35 = arith.constant 1 : index + scf.for %arg7 = %c0_33 to %c32_34 step %c1_35 { + %0 = memref.load %arg0[%arg4, %arg5, %arg6, %arg7] : memref<1x3x32x32xf32> + memref.store %0, %alloc_1[%arg4, %arg6, %arg7, %arg5] : memref<1x32x32x3xf32> + } + } + } + } + %c0_7 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c1_8 = arith.constant 1 : index + scf.for %arg4 = %c0_7 to %c16 step %c1_8 { + %c0_27 = arith.constant 0 : index + %c3_28 = arith.constant 3 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c3_31 = arith.constant 3 : index + %c1_32 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 { + %c0_33 = arith.constant 0 : index + %c3_34 = arith.constant 3 : index + %c1_35 = arith.constant 1 : index + scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 { + %0 = arith.muli %arg6, %c3 : index + %1 = arith.muli %0, %c3_5 : index + %2 = arith.muli %arg7, %c3_5 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %arg1[%arg4, %arg5, %arg6, %arg7] : memref<16x3x3x3xf32> + memref.store %5, %alloc_2[%4, %arg4] : memref<27x16xf32> + } + } + } + } + %c3_i64 = arith.constant 3 : i64 + gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64 + %c0_9 = arith.constant 0 : index + %c1_10 = arith.constant 1 : index + %c1_11 = arith.constant 1 : index + scf.for %arg4 = %c0_9 to %c1_10 step %c1_11 { + %c0_27 = arith.constant 0 : index + %c16_28 = arith.constant 16 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 { + %c0_30 = 
arith.constant 0 : index + %c30 = arith.constant 30 : index + %c1_31 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c30 step %c1_31 { + %c0_32 = arith.constant 0 : index + %c30_33 = arith.constant 30 : index + %c1_34 = arith.constant 1 : index + scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 { + %c30_35 = arith.constant 30 : index + %0 = arith.muli %arg4, %c30_35 : index + %1 = arith.muli %0, %c30_35 : index + %2 = arith.muli %arg6, %c30_35 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg7 : index + %5 = memref.load %alloc_4[%4, %arg5] : memref<900x16xf32> + memref.store %5, %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32> + } + } + } + } + memref.dealloc %alloc_1 : memref<1x32x32x3xf32> + memref.dealloc %alloc_2 : memref<27x16xf32> + memref.dealloc %alloc_4 : memref<900x16xf32> + memref.dealloc %alloc_3 : memref<16xi32> + %alloc_12 = memref.alloc() : memref<1x30x30x16xf32> + %alloc_13 = memref.alloc() : memref<144x32xf32> + %alloc_14 = memref.alloc() : memref<32xi32> + %alloc_15 = memref.alloc() : memref<676x32xf32> + %c26_i64 = arith.constant 26 : i64 + %c3_16 = arith.constant 3 : index + %c16_17 = arith.constant 16 : index + %c0_18 = arith.constant 0 : index + %c1_19 = arith.constant 1 : index + %c1_20 = arith.constant 1 : index + scf.for %arg4 = %c0_18 to %c1_19 step %c1_20 { + %c0_27 = arith.constant 0 : index + %c16_28 = arith.constant 16 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c30 = arith.constant 30 : index + %c1_31 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c30 step %c1_31 { + %c0_32 = arith.constant 0 : index + %c30_33 = arith.constant 30 : index + %c1_34 = arith.constant 1 : index + scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 { + %0 = memref.load %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32> + memref.store %0, %alloc_12[%arg4, %arg6, %arg7, %arg5] : memref<1x30x30x16xf32> + } + } + } + } + %c0_21 
= arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1_22 = arith.constant 1 : index + scf.for %arg4 = %c0_21 to %c32 step %c1_22 { + %c0_27 = arith.constant 0 : index + %c16_28 = arith.constant 16 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c3_31 = arith.constant 3 : index + %c1_32 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 { + %c0_33 = arith.constant 0 : index + %c3_34 = arith.constant 3 : index + %c1_35 = arith.constant 1 : index + scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 { + %0 = arith.muli %arg6, %c3_16 : index + %1 = arith.muli %0, %c16_17 : index + %2 = arith.muli %arg7, %c16_17 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %arg2[%arg4, %arg5, %arg6, %arg7] : memref<32x16x3x3xf32> + memref.store %5, %alloc_13[%4, %arg4] : memref<144x32xf32> + } + } + } + } + %c3_i64_23 = arith.constant 3 : i64 + gemmini.tile_conv %alloc_12 %alloc_13 %alloc_14 %alloc_15 %c26_i64 %c26_i64 %c3_i64_23 : memref<1x30x30x16xf32> memref<144x32xf32> memref<32xi32> memref<676x32xf32> i64 i64 i64 + %c0_24 = arith.constant 0 : index + %c1_25 = arith.constant 1 : index + %c1_26 = arith.constant 1 : index + scf.for %arg4 = %c0_24 to %c1_25 step %c1_26 { + %c0_27 = arith.constant 0 : index + %c32_28 = arith.constant 32 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c32_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c26 = arith.constant 26 : index + %c1_31 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c26 step %c1_31 { + %c0_32 = arith.constant 0 : index + %c26_33 = arith.constant 26 : index + %c1_34 = arith.constant 1 : index + scf.for %arg7 = %c0_32 to %c26_33 step %c1_34 { + %c26_35 = arith.constant 26 : index + %0 = arith.muli %arg4, %c26_35 : index + %1 = arith.muli %0, %c26_35 : index + %2 = arith.muli %arg6, %c26_35 : index + %3 = arith.addi %1, %2 : index 
+ %4 = arith.addi %3, %arg7 : index + %5 = memref.load %alloc_15[%4, %arg5] : memref<676x32xf32> + memref.store %5, %alloc_0[%arg4, %arg5, %arg6, %arg7] : memref<1x32x26x26xf32> + } + } + } + } + memref.dealloc %alloc_12 : memref<1x30x30x16xf32> + memref.dealloc %alloc_13 : memref<144x32xf32> + memref.dealloc %alloc_15 : memref<676x32xf32> + memref.dealloc %alloc_14 : memref<32xi32> + linalg.copy ins(%alloc_0 : memref<1x32x26x26xf32>) outs(%arg3 : memref<1x32x26x26xf32>) + memref.dealloc %alloc : memref<1x16x30x30xf32> + memref.dealloc %alloc_0 : memref<1x32x26x26xf32> + return + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.func @mini_cnn_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %2 = llvm.insertvalue %arg34, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %3 = llvm.insertvalue %arg35, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %4 = llvm.insertvalue %arg36, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %5 = llvm.insertvalue %arg40, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, 
array<4 x i64>)> + %6 = llvm.insertvalue %arg37, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %7 = llvm.insertvalue %arg41, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %8 = llvm.insertvalue %arg38, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %9 = llvm.insertvalue %arg42, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %10 = llvm.insertvalue %arg39, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %11 = llvm.insertvalue %arg43, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %12 = builtin.unrealized_conversion_cast %11 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> to memref<1x32x26x26xf32> + %13 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %14 = llvm.insertvalue %arg22, %13[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %15 = llvm.insertvalue %arg23, %14[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %16 = llvm.insertvalue %arg24, %15[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.insertvalue %arg25, %16[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg29, %17[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg26, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg30, %19[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg27, %20[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg31, %21[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg28, %22[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.insertvalue %arg32, %23[4, 3] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %26 = llvm.insertvalue %arg11, %25[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %27 = llvm.insertvalue %arg12, %26[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %28 = llvm.insertvalue %arg13, %27[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %29 = llvm.insertvalue %arg14, %28[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %30 = llvm.insertvalue %arg18, %29[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %31 = llvm.insertvalue %arg15, %30[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %32 = llvm.insertvalue %arg19, %31[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %33 = llvm.insertvalue %arg16, %32[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.insertvalue %arg20, %33[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.insertvalue %arg17, %34[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.insertvalue %arg21, %35[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %arg0, %37[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %arg1, %38[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %arg2, %39[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %arg3, %40[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.insertvalue %arg7, %41[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.insertvalue %arg4, %42[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, 
array<4 x i64>)> + %44 = llvm.insertvalue %arg8, %43[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %arg5, %44[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %arg9, %45[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.insertvalue %arg6, %46[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %48 = llvm.insertvalue %arg10, %47[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.mlir.constant(1 : index) : i64 + %50 = llvm.mlir.constant(16 : index) : i64 + %51 = llvm.mlir.constant(30 : index) : i64 + %52 = llvm.mlir.constant(30 : index) : i64 + %53 = llvm.mlir.constant(1 : index) : i64 + %54 = llvm.mlir.constant(900 : index) : i64 + %55 = llvm.mlir.constant(14400 : index) : i64 + %56 = llvm.mlir.constant(14400 : index) : i64 + %57 = llvm.mlir.zero : !llvm.ptr + %58 = llvm.getelementptr %57[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %59 = llvm.ptrtoint %58 : !llvm.ptr to i64 + %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr + %61 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.mlir.constant(0 : index) : i64 + %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.insertvalue %49, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %67 = llvm.insertvalue %50, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %68 = llvm.insertvalue %51, %67[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %52, %68[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %55, 
%69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %54, %70[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %52, %71[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %53, %72[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.mlir.constant(1 : index) : i64 + %75 = llvm.mlir.constant(32 : index) : i64 + %76 = llvm.mlir.constant(26 : index) : i64 + %77 = llvm.mlir.constant(26 : index) : i64 + %78 = llvm.mlir.constant(1 : index) : i64 + %79 = llvm.mlir.constant(676 : index) : i64 + %80 = llvm.mlir.constant(21632 : index) : i64 + %81 = llvm.mlir.constant(21632 : index) : i64 + %82 = llvm.mlir.zero : !llvm.ptr + %83 = llvm.getelementptr %82[%81] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %84 = llvm.ptrtoint %83 : !llvm.ptr to i64 + %85 = llvm.call @malloc(%84) : (i64) -> !llvm.ptr + %86 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %87 = llvm.insertvalue %85, %86[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %88 = llvm.insertvalue %85, %87[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %89 = llvm.mlir.constant(0 : index) : i64 + %90 = llvm.insertvalue %89, %88[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %74, %90[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %75, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %76, %92[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %77, %93[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.insertvalue %80, %94[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %96 = llvm.insertvalue %79, %95[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, 
array<4 x i64>)> + %97 = llvm.insertvalue %77, %96[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %98 = llvm.insertvalue %78, %97[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %99 = builtin.unrealized_conversion_cast %98 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> to memref<1x32x26x26xf32> + %100 = llvm.mlir.constant(1 : index) : i64 + %101 = llvm.mlir.constant(32 : index) : i64 + %102 = llvm.mlir.constant(32 : index) : i64 + %103 = llvm.mlir.constant(3 : index) : i64 + %104 = llvm.mlir.constant(1 : index) : i64 + %105 = llvm.mlir.constant(96 : index) : i64 + %106 = llvm.mlir.constant(3072 : index) : i64 + %107 = llvm.mlir.constant(3072 : index) : i64 + %108 = llvm.mlir.zero : !llvm.ptr + %109 = llvm.getelementptr %108[%107] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %110 = llvm.ptrtoint %109 : !llvm.ptr to i64 + %111 = llvm.call @malloc(%110) : (i64) -> !llvm.ptr + %112 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %113 = llvm.insertvalue %111, %112[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %114 = llvm.insertvalue %111, %113[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %115 = llvm.mlir.constant(0 : index) : i64 + %116 = llvm.insertvalue %115, %114[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %117 = llvm.insertvalue %100, %116[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %118 = llvm.insertvalue %101, %117[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %119 = llvm.insertvalue %102, %118[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %120 = llvm.insertvalue %103, %119[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %121 = llvm.insertvalue %106, %120[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %122 = llvm.insertvalue %105, %121[4, 1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> + %123 = llvm.insertvalue %103, %122[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %124 = llvm.insertvalue %104, %123[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %125 = llvm.mlir.constant(27 : index) : i64 + %126 = llvm.mlir.constant(16 : index) : i64 + %127 = llvm.mlir.constant(1 : index) : i64 + %128 = llvm.mlir.constant(432 : index) : i64 + %129 = llvm.mlir.zero : !llvm.ptr + %130 = llvm.getelementptr %129[%128] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %131 = llvm.ptrtoint %130 : !llvm.ptr to i64 + %132 = llvm.call @malloc(%131) : (i64) -> !llvm.ptr + %133 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %134 = llvm.insertvalue %132, %133[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %135 = llvm.insertvalue %132, %134[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %136 = llvm.mlir.constant(0 : index) : i64 + %137 = llvm.insertvalue %136, %135[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %138 = llvm.insertvalue %125, %137[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %139 = llvm.insertvalue %126, %138[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %140 = llvm.insertvalue %126, %139[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %141 = llvm.insertvalue %127, %140[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %142 = llvm.mlir.constant(16 : index) : i64 + %143 = llvm.mlir.constant(1 : index) : i64 + %144 = llvm.mlir.zero : !llvm.ptr + %145 = llvm.getelementptr %144[%142] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %146 = llvm.ptrtoint %145 : !llvm.ptr to i64 + %147 = llvm.call @malloc(%146) : (i64) -> !llvm.ptr + %148 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %149 = llvm.insertvalue %147, %148[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> + %150 = llvm.insertvalue %147, %149[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %151 = llvm.mlir.constant(0 : index) : i64 + %152 = llvm.insertvalue %151, %150[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %153 = llvm.insertvalue %142, %152[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %154 = llvm.insertvalue %143, %153[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %155 = llvm.mlir.constant(900 : index) : i64 + %156 = llvm.mlir.constant(16 : index) : i64 + %157 = llvm.mlir.constant(1 : index) : i64 + %158 = llvm.mlir.constant(14400 : index) : i64 + %159 = llvm.mlir.zero : !llvm.ptr + %160 = llvm.getelementptr %159[%158] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %161 = llvm.ptrtoint %160 : !llvm.ptr to i64 + %162 = llvm.call @malloc(%161) : (i64) -> !llvm.ptr + %163 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %164 = llvm.insertvalue %162, %163[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %165 = llvm.insertvalue %162, %164[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %166 = llvm.mlir.constant(0 : index) : i64 + %167 = llvm.insertvalue %166, %165[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %168 = llvm.insertvalue %155, %167[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %169 = llvm.insertvalue %156, %168[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %170 = llvm.insertvalue %156, %169[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %171 = llvm.insertvalue %157, %170[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %172 = llvm.mlir.constant(30 : i64) : i64 + %173 = llvm.mlir.constant(3 : index) : i64 + %174 = llvm.mlir.constant(3 : index) : i64 + %175 = llvm.mlir.constant(0 : index) : i64 + %176 = llvm.mlir.constant(1 : index) : i64 + %177 = 
llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%175 : i64) + ^bb1(%178: i64): // 2 preds: ^bb0, ^bb11 + %179 = llvm.icmp "slt" %178, %176 : i64 + llvm.cond_br %179, ^bb2, ^bb12 + ^bb2: // pred: ^bb1 + %180 = llvm.mlir.constant(0 : index) : i64 + %181 = llvm.mlir.constant(3 : index) : i64 + %182 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb3(%180 : i64) + ^bb3(%183: i64): // 2 preds: ^bb2, ^bb10 + %184 = llvm.icmp "slt" %183, %181 : i64 + llvm.cond_br %184, ^bb4, ^bb11 + ^bb4: // pred: ^bb3 + %185 = llvm.mlir.constant(0 : index) : i64 + %186 = llvm.mlir.constant(32 : index) : i64 + %187 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb5(%185 : i64) + ^bb5(%188: i64): // 2 preds: ^bb4, ^bb9 + %189 = llvm.icmp "slt" %188, %186 : i64 + llvm.cond_br %189, ^bb6, ^bb10 + ^bb6: // pred: ^bb5 + %190 = llvm.mlir.constant(0 : index) : i64 + %191 = llvm.mlir.constant(32 : index) : i64 + %192 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb7(%190 : i64) + ^bb7(%193: i64): // 2 preds: ^bb6, ^bb8 + %194 = llvm.icmp "slt" %193, %191 : i64 + llvm.cond_br %194, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %195 = llvm.extractvalue %48[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %196 = llvm.mlir.constant(3072 : index) : i64 + %197 = llvm.mul %178, %196 : i64 + %198 = llvm.mlir.constant(1024 : index) : i64 + %199 = llvm.mul %183, %198 : i64 + %200 = llvm.add %197, %199 : i64 + %201 = llvm.mlir.constant(32 : index) : i64 + %202 = llvm.mul %188, %201 : i64 + %203 = llvm.add %200, %202 : i64 + %204 = llvm.add %203, %193 : i64 + %205 = llvm.getelementptr %195[%204] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %206 = llvm.load %205 : !llvm.ptr -> f32 + %207 = llvm.extractvalue %124[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %208 = llvm.mlir.constant(3072 : index) : i64 + %209 = llvm.mul %178, %208 : i64 + %210 = llvm.mlir.constant(96 : index) : i64 + %211 = llvm.mul %188, %210 : i64 + %212 = llvm.add %209, %211 : i64 + %213 = 
llvm.mlir.constant(3 : index) : i64 + %214 = llvm.mul %193, %213 : i64 + %215 = llvm.add %212, %214 : i64 + %216 = llvm.add %215, %183 : i64 + %217 = llvm.getelementptr %207[%216] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %206, %217 : f32, !llvm.ptr + %218 = llvm.add %193, %192 : i64 + llvm.br ^bb7(%218 : i64) + ^bb9: // pred: ^bb7 + %219 = llvm.add %188, %187 : i64 + llvm.br ^bb5(%219 : i64) + ^bb10: // pred: ^bb5 + %220 = llvm.add %183, %182 : i64 + llvm.br ^bb3(%220 : i64) + ^bb11: // pred: ^bb3 + %221 = llvm.add %178, %177 : i64 + llvm.br ^bb1(%221 : i64) + ^bb12: // pred: ^bb1 + %222 = llvm.mlir.constant(0 : index) : i64 + %223 = llvm.mlir.constant(16 : index) : i64 + %224 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb13(%222 : i64) + ^bb13(%225: i64): // 2 preds: ^bb12, ^bb23 + %226 = llvm.icmp "slt" %225, %223 : i64 + llvm.cond_br %226, ^bb14, ^bb24 + ^bb14: // pred: ^bb13 + %227 = llvm.mlir.constant(0 : index) : i64 + %228 = llvm.mlir.constant(3 : index) : i64 + %229 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb15(%227 : i64) + ^bb15(%230: i64): // 2 preds: ^bb14, ^bb22 + %231 = llvm.icmp "slt" %230, %228 : i64 + llvm.cond_br %231, ^bb16, ^bb23 + ^bb16: // pred: ^bb15 + %232 = llvm.mlir.constant(0 : index) : i64 + %233 = llvm.mlir.constant(3 : index) : i64 + %234 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb17(%232 : i64) + ^bb17(%235: i64): // 2 preds: ^bb16, ^bb21 + %236 = llvm.icmp "slt" %235, %233 : i64 + llvm.cond_br %236, ^bb18, ^bb22 + ^bb18: // pred: ^bb17 + %237 = llvm.mlir.constant(0 : index) : i64 + %238 = llvm.mlir.constant(3 : index) : i64 + %239 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb19(%237 : i64) + ^bb19(%240: i64): // 2 preds: ^bb18, ^bb20 + %241 = llvm.icmp "slt" %240, %238 : i64 + llvm.cond_br %241, ^bb20, ^bb21 + ^bb20: // pred: ^bb19 + %242 = llvm.mul %235, %173 : i64 + %243 = llvm.mul %242, %174 : i64 + %244 = llvm.mul %240, %174 : i64 + %245 = llvm.add %243, %244 : i64 + %246 = llvm.add %245, %230 
: i64 + %247 = llvm.extractvalue %36[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %248 = llvm.mlir.constant(27 : index) : i64 + %249 = llvm.mul %225, %248 : i64 + %250 = llvm.mlir.constant(9 : index) : i64 + %251 = llvm.mul %230, %250 : i64 + %252 = llvm.add %249, %251 : i64 + %253 = llvm.mlir.constant(3 : index) : i64 + %254 = llvm.mul %235, %253 : i64 + %255 = llvm.add %252, %254 : i64 + %256 = llvm.add %255, %240 : i64 + %257 = llvm.getelementptr %247[%256] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %258 = llvm.load %257 : !llvm.ptr -> f32 + %259 = llvm.extractvalue %141[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %260 = llvm.mlir.constant(16 : index) : i64 + %261 = llvm.mul %246, %260 : i64 + %262 = llvm.add %261, %225 : i64 + %263 = llvm.getelementptr %259[%262] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %258, %263 : f32, !llvm.ptr + %264 = llvm.add %240, %239 : i64 + llvm.br ^bb19(%264 : i64) + ^bb21: // pred: ^bb19 + %265 = llvm.add %235, %234 : i64 + llvm.br ^bb17(%265 : i64) + ^bb22: // pred: ^bb17 + %266 = llvm.add %230, %229 : i64 + llvm.br ^bb15(%266 : i64) + ^bb23: // pred: ^bb15 + %267 = llvm.add %225, %224 : i64 + llvm.br ^bb13(%267 : i64) + ^bb24: // pred: ^bb13 + %268 = llvm.mlir.constant(3 : i64) : i64 + %269 = llvm.extractvalue %124[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %270 = llvm.ptrtoint %269 : !llvm.ptr to i64 + %271 = llvm.extractvalue %171[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %272 = llvm.ptrtoint %271 : !llvm.ptr to i64 + %273 = llvm.extractvalue %154[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %274 = llvm.ptrtoint %273 : !llvm.ptr to i64 + %275 = llvm.extractvalue %141[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %276 = llvm.ptrtoint %275 : !llvm.ptr to i64 + %277 = llvm.mlir.constant(16 : i64) : i64 + %278 = llvm.mlir.constant(2 : i64) : i64 + %279 = 
llvm.mlir.constant(4575657221408423952 : i64) : i64 + "gemmini.intr.config_st"(%278, %279) : (i64, i64) -> () + %280 = llvm.mlir.constant(65540 : i64) : i64 + %281 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%280, %281) : (i64, i64) -> () + %282 = llvm.mlir.constant(0 : i64) : i64 + %283 = llvm.mlir.constant(0 : i64) : i64 + %284 = llvm.mlir.constant(0 : i64) : i64 + %285 = llvm.mlir.constant(0 : i64) : i64 + %286 = llvm.mlir.constant(4503612514369537 : i64) : i64 + %287 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%286, %287) : (i64, i64) -> () + %288 = llvm.mlir.constant(844429225164800 : i64) : i64 + %289 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%288, %289) : (i64, i64) -> () + %290 = llvm.mlir.constant(844437815230464 : i64) : i64 + %291 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%290, %291) : (i64, i64) -> () + %292 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %293 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%292, %293) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%276, %272) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%274, %270) : (i64, i64) -> () + %294 = llvm.mlir.constant(768 : i64) : i64 + %295 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%294, %295) : (i64, i64) -> () + %296 = llvm.mlir.constant(368 : i64) : i64 + %297 = llvm.add %272, %296 : i64 + %298 = llvm.mlir.constant(0 : i64) : i64 + %299 = llvm.mlir.constant(0 : i64) : i64 + %300 = llvm.mlir.constant(69 : i64) : i64 + %301 = llvm.add %270, %300 : i64 + %302 = llvm.mlir.constant(4503612514369537 : i64) : i64 + %303 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%302, %303) : (i64, i64) -> () + %304 = llvm.mlir.constant(844429225164800 : i64) : i64 + %305 = llvm.mlir.constant(281569466449936 : i64) : i64 + 
"gemmini.intr.loop_conv_ws_config2"(%304, %305) : (i64, i64) -> () + %306 = llvm.mlir.constant(844437815230464 : i64) : i64 + %307 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%306, %307) : (i64, i64) -> () + %308 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %309 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%308, %309) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%276, %297) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%274, %301) : (i64, i64) -> () + %310 = llvm.mlir.constant(768 : i64) : i64 + %311 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%310, %311) : (i64, i64) -> () + %312 = llvm.mlir.constant(10560 : i64) : i64 + %313 = llvm.add %272, %312 : i64 + %314 = llvm.mlir.constant(0 : i64) : i64 + %315 = llvm.mlir.constant(0 : i64) : i64 + %316 = llvm.mlir.constant(2112 : i64) : i64 + %317 = llvm.add %270, %316 : i64 + %318 = llvm.mlir.constant(4503612514369537 : i64) : i64 + %319 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%318, %319) : (i64, i64) -> () + %320 = llvm.mlir.constant(844429225164800 : i64) : i64 + %321 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%320, %321) : (i64, i64) -> () + %322 = llvm.mlir.constant(844437815230464 : i64) : i64 + %323 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%322, %323) : (i64, i64) -> () + %324 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %325 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%324, %325) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%276, %313) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%274, %317) : (i64, i64) -> () + %326 = llvm.mlir.constant(768 : i64) : i64 + %327 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%326, %327) : (i64, i64) -> () + %328 = llvm.mlir.constant(10928 : i64) : i64 + %329 = 
llvm.add %272, %328 : i64 + %330 = llvm.mlir.constant(0 : i64) : i64 + %331 = llvm.mlir.constant(0 : i64) : i64 + %332 = llvm.mlir.constant(2181 : i64) : i64 + %333 = llvm.add %270, %332 : i64 + %334 = llvm.mlir.constant(4503612514369537 : i64) : i64 + %335 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%334, %335) : (i64, i64) -> () + %336 = llvm.mlir.constant(844429225164800 : i64) : i64 + %337 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%336, %337) : (i64, i64) -> () + %338 = llvm.mlir.constant(844437815230464 : i64) : i64 + %339 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%338, %339) : (i64, i64) -> () + %340 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %341 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%340, %341) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%276, %329) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%274, %333) : (i64, i64) -> () + %342 = llvm.mlir.constant(768 : i64) : i64 + %343 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%342, %343) : (i64, i64) -> () + %344 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%344, %344) : (i64, i64) -> () + %345 = llvm.mlir.constant(0 : index) : i64 + %346 = llvm.mlir.constant(1 : index) : i64 + %347 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb25(%345 : i64) + ^bb25(%348: i64): // 2 preds: ^bb24, ^bb35 + %349 = llvm.icmp "slt" %348, %346 : i64 + llvm.cond_br %349, ^bb26, ^bb36 + ^bb26: // pred: ^bb25 + %350 = llvm.mlir.constant(0 : index) : i64 + %351 = llvm.mlir.constant(16 : index) : i64 + %352 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb27(%350 : i64) + ^bb27(%353: i64): // 2 preds: ^bb26, ^bb34 + %354 = llvm.icmp "slt" %353, %351 : i64 + llvm.cond_br %354, ^bb28, ^bb35 + ^bb28: // pred: ^bb27 + %355 = llvm.mlir.constant(0 : index) : i64 + %356 = llvm.mlir.constant(30 : index) : i64 + %357 = 
llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb29(%355 : i64) + ^bb29(%358: i64): // 2 preds: ^bb28, ^bb33 + %359 = llvm.icmp "slt" %358, %356 : i64 + llvm.cond_br %359, ^bb30, ^bb34 + ^bb30: // pred: ^bb29 + %360 = llvm.mlir.constant(0 : index) : i64 + %361 = llvm.mlir.constant(30 : index) : i64 + %362 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb31(%360 : i64) + ^bb31(%363: i64): // 2 preds: ^bb30, ^bb32 + %364 = llvm.icmp "slt" %363, %361 : i64 + llvm.cond_br %364, ^bb32, ^bb33 + ^bb32: // pred: ^bb31 + %365 = llvm.mlir.constant(30 : index) : i64 + %366 = llvm.mul %348, %365 : i64 + %367 = llvm.mul %366, %365 : i64 + %368 = llvm.mul %358, %365 : i64 + %369 = llvm.add %367, %368 : i64 + %370 = llvm.add %369, %363 : i64 + %371 = llvm.extractvalue %171[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %372 = llvm.mlir.constant(16 : index) : i64 + %373 = llvm.mul %370, %372 : i64 + %374 = llvm.add %373, %353 : i64 + %375 = llvm.getelementptr %371[%374] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %376 = llvm.load %375 : !llvm.ptr -> f32 + %377 = llvm.extractvalue %73[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %378 = llvm.mlir.constant(14400 : index) : i64 + %379 = llvm.mul %348, %378 : i64 + %380 = llvm.mlir.constant(900 : index) : i64 + %381 = llvm.mul %353, %380 : i64 + %382 = llvm.add %379, %381 : i64 + %383 = llvm.mlir.constant(30 : index) : i64 + %384 = llvm.mul %358, %383 : i64 + %385 = llvm.add %382, %384 : i64 + %386 = llvm.add %385, %363 : i64 + %387 = llvm.getelementptr %377[%386] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %376, %387 : f32, !llvm.ptr + %388 = llvm.add %363, %362 : i64 + llvm.br ^bb31(%388 : i64) + ^bb33: // pred: ^bb31 + %389 = llvm.add %358, %357 : i64 + llvm.br ^bb29(%389 : i64) + ^bb34: // pred: ^bb29 + %390 = llvm.add %353, %352 : i64 + llvm.br ^bb27(%390 : i64) + ^bb35: // pred: ^bb27 + %391 = llvm.add %348, %347 : i64 + llvm.br ^bb25(%391 : i64) + ^bb36: // pred: ^bb25 + %392 = 
llvm.extractvalue %124[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + llvm.call @free(%392) : (!llvm.ptr) -> () + %393 = llvm.extractvalue %141[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%393) : (!llvm.ptr) -> () + %394 = llvm.extractvalue %171[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%394) : (!llvm.ptr) -> () + %395 = llvm.extractvalue %154[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + llvm.call @free(%395) : (!llvm.ptr) -> () + %396 = llvm.mlir.constant(1 : index) : i64 + %397 = llvm.mlir.constant(30 : index) : i64 + %398 = llvm.mlir.constant(30 : index) : i64 + %399 = llvm.mlir.constant(16 : index) : i64 + %400 = llvm.mlir.constant(1 : index) : i64 + %401 = llvm.mlir.constant(480 : index) : i64 + %402 = llvm.mlir.constant(14400 : index) : i64 + %403 = llvm.mlir.constant(14400 : index) : i64 + %404 = llvm.mlir.zero : !llvm.ptr + %405 = llvm.getelementptr %404[%403] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %406 = llvm.ptrtoint %405 : !llvm.ptr to i64 + %407 = llvm.call @malloc(%406) : (i64) -> !llvm.ptr + %408 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %409 = llvm.insertvalue %407, %408[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %410 = llvm.insertvalue %407, %409[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %411 = llvm.mlir.constant(0 : index) : i64 + %412 = llvm.insertvalue %411, %410[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %413 = llvm.insertvalue %396, %412[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %414 = llvm.insertvalue %397, %413[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %415 = llvm.insertvalue %398, %414[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %416 = llvm.insertvalue %399, %415[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x 
i64>, array<4 x i64>)> + %417 = llvm.insertvalue %402, %416[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %418 = llvm.insertvalue %401, %417[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %419 = llvm.insertvalue %399, %418[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %420 = llvm.insertvalue %400, %419[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %421 = llvm.mlir.constant(144 : index) : i64 + %422 = llvm.mlir.constant(32 : index) : i64 + %423 = llvm.mlir.constant(1 : index) : i64 + %424 = llvm.mlir.constant(4608 : index) : i64 + %425 = llvm.mlir.zero : !llvm.ptr + %426 = llvm.getelementptr %425[%424] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %427 = llvm.ptrtoint %426 : !llvm.ptr to i64 + %428 = llvm.call @malloc(%427) : (i64) -> !llvm.ptr + %429 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %430 = llvm.insertvalue %428, %429[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %431 = llvm.insertvalue %428, %430[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %432 = llvm.mlir.constant(0 : index) : i64 + %433 = llvm.insertvalue %432, %431[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %434 = llvm.insertvalue %421, %433[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %435 = llvm.insertvalue %422, %434[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %436 = llvm.insertvalue %422, %435[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %437 = llvm.insertvalue %423, %436[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %438 = llvm.mlir.constant(32 : index) : i64 + %439 = llvm.mlir.constant(1 : index) : i64 + %440 = llvm.mlir.zero : !llvm.ptr + %441 = llvm.getelementptr %440[%438] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %442 = llvm.ptrtoint %441 : !llvm.ptr to i64 + %443 = llvm.call 
@malloc(%442) : (i64) -> !llvm.ptr + %444 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %445 = llvm.insertvalue %443, %444[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %446 = llvm.insertvalue %443, %445[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %447 = llvm.mlir.constant(0 : index) : i64 + %448 = llvm.insertvalue %447, %446[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %449 = llvm.insertvalue %438, %448[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %450 = llvm.insertvalue %439, %449[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %451 = llvm.mlir.constant(676 : index) : i64 + %452 = llvm.mlir.constant(32 : index) : i64 + %453 = llvm.mlir.constant(1 : index) : i64 + %454 = llvm.mlir.constant(21632 : index) : i64 + %455 = llvm.mlir.zero : !llvm.ptr + %456 = llvm.getelementptr %455[%454] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %457 = llvm.ptrtoint %456 : !llvm.ptr to i64 + %458 = llvm.call @malloc(%457) : (i64) -> !llvm.ptr + %459 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %460 = llvm.insertvalue %458, %459[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %461 = llvm.insertvalue %458, %460[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %462 = llvm.mlir.constant(0 : index) : i64 + %463 = llvm.insertvalue %462, %461[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %464 = llvm.insertvalue %451, %463[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %465 = llvm.insertvalue %452, %464[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %466 = llvm.insertvalue %452, %465[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %467 = llvm.insertvalue %453, %466[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %468 = llvm.mlir.constant(26 
: i64) : i64 + %469 = llvm.mlir.constant(3 : index) : i64 + %470 = llvm.mlir.constant(16 : index) : i64 + %471 = llvm.mlir.constant(0 : index) : i64 + %472 = llvm.mlir.constant(1 : index) : i64 + %473 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb37(%471 : i64) + ^bb37(%474: i64): // 2 preds: ^bb36, ^bb47 + %475 = llvm.icmp "slt" %474, %472 : i64 + llvm.cond_br %475, ^bb38, ^bb48 + ^bb38: // pred: ^bb37 + %476 = llvm.mlir.constant(0 : index) : i64 + %477 = llvm.mlir.constant(16 : index) : i64 + %478 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb39(%476 : i64) + ^bb39(%479: i64): // 2 preds: ^bb38, ^bb46 + %480 = llvm.icmp "slt" %479, %477 : i64 + llvm.cond_br %480, ^bb40, ^bb47 + ^bb40: // pred: ^bb39 + %481 = llvm.mlir.constant(0 : index) : i64 + %482 = llvm.mlir.constant(30 : index) : i64 + %483 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb41(%481 : i64) + ^bb41(%484: i64): // 2 preds: ^bb40, ^bb45 + %485 = llvm.icmp "slt" %484, %482 : i64 + llvm.cond_br %485, ^bb42, ^bb46 + ^bb42: // pred: ^bb41 + %486 = llvm.mlir.constant(0 : index) : i64 + %487 = llvm.mlir.constant(30 : index) : i64 + %488 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb43(%486 : i64) + ^bb43(%489: i64): // 2 preds: ^bb42, ^bb44 + %490 = llvm.icmp "slt" %489, %487 : i64 + llvm.cond_br %490, ^bb44, ^bb45 + ^bb44: // pred: ^bb43 + %491 = llvm.extractvalue %73[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %492 = llvm.mlir.constant(14400 : index) : i64 + %493 = llvm.mul %474, %492 : i64 + %494 = llvm.mlir.constant(900 : index) : i64 + %495 = llvm.mul %479, %494 : i64 + %496 = llvm.add %493, %495 : i64 + %497 = llvm.mlir.constant(30 : index) : i64 + %498 = llvm.mul %484, %497 : i64 + %499 = llvm.add %496, %498 : i64 + %500 = llvm.add %499, %489 : i64 + %501 = llvm.getelementptr %491[%500] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %502 = llvm.load %501 : !llvm.ptr -> f32 + %503 = llvm.extractvalue %420[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %504 = llvm.mlir.constant(14400 : index) : i64 + %505 = llvm.mul %474, %504 : i64 + %506 = llvm.mlir.constant(480 : index) : i64 + %507 = llvm.mul %484, %506 : i64 + %508 = llvm.add %505, %507 : i64 + %509 = llvm.mlir.constant(16 : index) : i64 + %510 = llvm.mul %489, %509 : i64 + %511 = llvm.add %508, %510 : i64 + %512 = llvm.add %511, %479 : i64 + %513 = llvm.getelementptr %503[%512] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %502, %513 : f32, !llvm.ptr + %514 = llvm.add %489, %488 : i64 + llvm.br ^bb43(%514 : i64) + ^bb45: // pred: ^bb43 + %515 = llvm.add %484, %483 : i64 + llvm.br ^bb41(%515 : i64) + ^bb46: // pred: ^bb41 + %516 = llvm.add %479, %478 : i64 + llvm.br ^bb39(%516 : i64) + ^bb47: // pred: ^bb39 + %517 = llvm.add %474, %473 : i64 + llvm.br ^bb37(%517 : i64) + ^bb48: // pred: ^bb37 + %518 = llvm.mlir.constant(0 : index) : i64 + %519 = llvm.mlir.constant(32 : index) : i64 + %520 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb49(%518 : i64) + ^bb49(%521: i64): // 2 preds: ^bb48, ^bb59 + %522 = llvm.icmp "slt" %521, %519 : i64 + llvm.cond_br %522, ^bb50, ^bb60 + ^bb50: // pred: ^bb49 + %523 = llvm.mlir.constant(0 : index) : i64 + %524 = llvm.mlir.constant(16 : index) : i64 + %525 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb51(%523 : i64) + ^bb51(%526: i64): // 2 preds: ^bb50, ^bb58 + %527 = llvm.icmp "slt" %526, %524 : i64 + llvm.cond_br %527, ^bb52, ^bb59 + ^bb52: // pred: ^bb51 + %528 = llvm.mlir.constant(0 : index) : i64 + %529 = llvm.mlir.constant(3 : index) : i64 + %530 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb53(%528 : i64) + ^bb53(%531: i64): // 2 preds: ^bb52, ^bb57 + %532 = llvm.icmp "slt" %531, %529 : i64 + llvm.cond_br %532, ^bb54, ^bb58 + ^bb54: // pred: ^bb53 + %533 = llvm.mlir.constant(0 : index) : i64 + %534 = llvm.mlir.constant(3 : index) : i64 + %535 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb55(%533 : i64) + ^bb55(%536: i64): // 2 preds: ^bb54, ^bb56 + %537 = llvm.icmp "slt" %536, %534 : i64 
+ llvm.cond_br %537, ^bb56, ^bb57 + ^bb56: // pred: ^bb55 + %538 = llvm.mul %531, %469 : i64 + %539 = llvm.mul %538, %470 : i64 + %540 = llvm.mul %536, %470 : i64 + %541 = llvm.add %539, %540 : i64 + %542 = llvm.add %541, %526 : i64 + %543 = llvm.extractvalue %24[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %544 = llvm.mlir.constant(144 : index) : i64 + %545 = llvm.mul %521, %544 : i64 + %546 = llvm.mlir.constant(9 : index) : i64 + %547 = llvm.mul %526, %546 : i64 + %548 = llvm.add %545, %547 : i64 + %549 = llvm.mlir.constant(3 : index) : i64 + %550 = llvm.mul %531, %549 : i64 + %551 = llvm.add %548, %550 : i64 + %552 = llvm.add %551, %536 : i64 + %553 = llvm.getelementptr %543[%552] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %554 = llvm.load %553 : !llvm.ptr -> f32 + %555 = llvm.extractvalue %437[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %556 = llvm.mlir.constant(32 : index) : i64 + %557 = llvm.mul %542, %556 : i64 + %558 = llvm.add %557, %521 : i64 + %559 = llvm.getelementptr %555[%558] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %554, %559 : f32, !llvm.ptr + %560 = llvm.add %536, %535 : i64 + llvm.br ^bb55(%560 : i64) + ^bb57: // pred: ^bb55 + %561 = llvm.add %531, %530 : i64 + llvm.br ^bb53(%561 : i64) + ^bb58: // pred: ^bb53 + %562 = llvm.add %526, %525 : i64 + llvm.br ^bb51(%562 : i64) + ^bb59: // pred: ^bb51 + %563 = llvm.add %521, %520 : i64 + llvm.br ^bb49(%563 : i64) + ^bb60: // pred: ^bb49 + %564 = llvm.mlir.constant(3 : i64) : i64 + %565 = llvm.extractvalue %420[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %566 = llvm.ptrtoint %565 : !llvm.ptr to i64 + %567 = llvm.extractvalue %467[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %568 = llvm.ptrtoint %567 : !llvm.ptr to i64 + %569 = llvm.extractvalue %450[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %570 = llvm.ptrtoint %569 : !llvm.ptr to i64 + %571 = llvm.extractvalue %437[1] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %572 = llvm.ptrtoint %571 : !llvm.ptr to i64 + %573 = llvm.mlir.constant(32 : i64) : i64 + %574 = llvm.mlir.constant(2 : i64) : i64 + %575 = llvm.mlir.constant(4575657221408423968 : i64) : i64 + "gemmini.intr.config_st"(%574, %575) : (i64, i64) -> () + %576 = llvm.mlir.constant(65540 : i64) : i64 + %577 = llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%576, %577) : (i64, i64) -> () + %578 = llvm.mlir.constant(0 : i64) : i64 + %579 = llvm.mlir.constant(0 : i64) : i64 + %580 = llvm.mlir.constant(0 : i64) : i64 + %581 = llvm.mlir.constant(0 : i64) : i64 + %582 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %583 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%582, %583) : (i64, i64) -> () + %584 = llvm.mlir.constant(844429225164800 : i64) : i64 + %585 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%584, %585) : (i64, i64) -> () + %586 = llvm.mlir.constant(844437816082432 : i64) : i64 + %587 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%586, %587) : (i64, i64) -> () + %588 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %589 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%588, %589) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%572, %568) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%570, %566) : (i64, i64) -> () + %590 = llvm.mlir.constant(256 : i64) : i64 + %591 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%590, %591) : (i64, i64) -> () + %592 = llvm.mlir.constant(16 : i64) : i64 + %593 = llvm.add %568, %592 : i64 + %594 = llvm.mlir.constant(64 : i64) : i64 + %595 = llvm.add %570, %594 : i64 + %596 = llvm.mlir.constant(16 : i64) : i64 + %597 = llvm.add %572, %596 : i64 + %598 = llvm.mlir.constant(0 : i64) : i64 + %599 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %600 = 
llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%599, %600) : (i64, i64) -> () + %601 = llvm.mlir.constant(844429225164800 : i64) : i64 + %602 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%601, %602) : (i64, i64) -> () + %603 = llvm.mlir.constant(844437816082432 : i64) : i64 + %604 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%603, %604) : (i64, i64) -> () + %605 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %606 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%605, %606) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%597, %593) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%595, %566) : (i64, i64) -> () + %607 = llvm.mlir.constant(256 : i64) : i64 + %608 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%607, %608) : (i64, i64) -> () + %609 = llvm.mlir.constant(736 : i64) : i64 + %610 = llvm.add %568, %609 : i64 + %611 = llvm.mlir.constant(0 : i64) : i64 + %612 = llvm.mlir.constant(0 : i64) : i64 + %613 = llvm.mlir.constant(368 : i64) : i64 + %614 = llvm.add %566, %613 : i64 + %615 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %616 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%615, %616) : (i64, i64) -> () + %617 = llvm.mlir.constant(844429225164800 : i64) : i64 + %618 = llvm.mlir.constant(281569466187792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%617, %618) : (i64, i64) -> () + %619 = llvm.mlir.constant(844437816082432 : i64) : i64 + %620 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%619, %620) : (i64, i64) -> () + %621 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %622 = llvm.mlir.constant(65539 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%621, %622) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%572, %610) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%570, %614) : 
(i64, i64) -> () + %623 = llvm.mlir.constant(256 : i64) : i64 + %624 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%623, %624) : (i64, i64) -> () + %625 = llvm.mlir.constant(752 : i64) : i64 + %626 = llvm.add %568, %625 : i64 + %627 = llvm.mlir.constant(64 : i64) : i64 + %628 = llvm.add %570, %627 : i64 + %629 = llvm.mlir.constant(16 : i64) : i64 + %630 = llvm.add %572, %629 : i64 + %631 = llvm.mlir.constant(368 : i64) : i64 + %632 = llvm.add %566, %631 : i64 + %633 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %634 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%633, %634) : (i64, i64) -> () + %635 = llvm.mlir.constant(844429225164800 : i64) : i64 + %636 = llvm.mlir.constant(281569466187792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%635, %636) : (i64, i64) -> () + %637 = llvm.mlir.constant(844437816082432 : i64) : i64 + %638 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%637, %638) : (i64, i64) -> () + %639 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %640 = llvm.mlir.constant(65539 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%639, %640) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%630, %626) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%628, %632) : (i64, i64) -> () + %641 = llvm.mlir.constant(256 : i64) : i64 + %642 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%641, %642) : (i64, i64) -> () + %643 = llvm.mlir.constant(18304 : i64) : i64 + %644 = llvm.add %568, %643 : i64 + %645 = llvm.mlir.constant(0 : i64) : i64 + %646 = llvm.mlir.constant(0 : i64) : i64 + %647 = llvm.mlir.constant(10560 : i64) : i64 + %648 = llvm.add %566, %647 : i64 + %649 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %650 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%649, %650) : (i64, i64) -> () + %651 = llvm.mlir.constant(844429225164800 : i64) : i64 + %652 = 
llvm.mlir.constant(281492158087184 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%651, %652) : (i64, i64) -> () + %653 = llvm.mlir.constant(844437816082432 : i64) : i64 + %654 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%653, %654) : (i64, i64) -> () + %655 = llvm.mlir.constant(1125899906842624 : i64) : i64 + %656 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%655, %656) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%572, %644) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%570, %648) : (i64, i64) -> () + %657 = llvm.mlir.constant(256 : i64) : i64 + %658 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%657, %658) : (i64, i64) -> () + %659 = llvm.mlir.constant(18320 : i64) : i64 + %660 = llvm.add %568, %659 : i64 + %661 = llvm.mlir.constant(64 : i64) : i64 + %662 = llvm.add %570, %661 : i64 + %663 = llvm.mlir.constant(16 : i64) : i64 + %664 = llvm.add %572, %663 : i64 + %665 = llvm.mlir.constant(10560 : i64) : i64 + %666 = llvm.add %566, %665 : i64 + %667 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %668 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%667, %668) : (i64, i64) -> () + %669 = llvm.mlir.constant(844429225164800 : i64) : i64 + %670 = llvm.mlir.constant(281492158087184 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%669, %670) : (i64, i64) -> () + %671 = llvm.mlir.constant(844437816082432 : i64) : i64 + %672 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%671, %672) : (i64, i64) -> () + %673 = llvm.mlir.constant(1125899906842624 : i64) : i64 + %674 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%673, %674) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%664, %660) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%662, %666) : (i64, i64) -> () + %675 = llvm.mlir.constant(256 : i64) : i64 + %676 = llvm.mlir.constant(1 : i64) : i64 
+ "gemmini.intr.loop_conv_ws"(%675, %676) : (i64, i64) -> () + %677 = llvm.mlir.constant(19040 : i64) : i64 + %678 = llvm.add %568, %677 : i64 + %679 = llvm.mlir.constant(0 : i64) : i64 + %680 = llvm.mlir.constant(0 : i64) : i64 + %681 = llvm.mlir.constant(10928 : i64) : i64 + %682 = llvm.add %566, %681 : i64 + %683 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %684 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%683, %684) : (i64, i64) -> () + %685 = llvm.mlir.constant(844429225164800 : i64) : i64 + %686 = llvm.mlir.constant(281492156776464 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%685, %686) : (i64, i64) -> () + %687 = llvm.mlir.constant(844437816082432 : i64) : i64 + %688 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%687, %688) : (i64, i64) -> () + %689 = llvm.mlir.constant(1125899906842624 : i64) : i64 + %690 = llvm.mlir.constant(65539 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%689, %690) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%572, %678) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%570, %682) : (i64, i64) -> () + %691 = llvm.mlir.constant(256 : i64) : i64 + %692 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%691, %692) : (i64, i64) -> () + %693 = llvm.mlir.constant(19056 : i64) : i64 + %694 = llvm.add %568, %693 : i64 + %695 = llvm.mlir.constant(64 : i64) : i64 + %696 = llvm.add %570, %695 : i64 + %697 = llvm.mlir.constant(16 : i64) : i64 + %698 = llvm.add %572, %697 : i64 + %699 = llvm.mlir.constant(10928 : i64) : i64 + %700 = llvm.add %566, %699 : i64 + %701 = llvm.mlir.constant(9007267976183809 : i64) : i64 + %702 = llvm.mlir.constant(4296671258 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%701, %702) : (i64, i64) -> () + %703 = llvm.mlir.constant(844429225164800 : i64) : i64 + %704 = llvm.mlir.constant(281492156776464 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%703, %704) : (i64, i64) -> () + %705 = 
llvm.mlir.constant(844437816082432 : i64) : i64 + %706 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%705, %706) : (i64, i64) -> () + %707 = llvm.mlir.constant(1125899906842624 : i64) : i64 + %708 = llvm.mlir.constant(65539 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%707, %708) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%698, %694) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%696, %700) : (i64, i64) -> () + %709 = llvm.mlir.constant(256 : i64) : i64 + %710 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%709, %710) : (i64, i64) -> () + %711 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%711, %711) : (i64, i64) -> () + %712 = llvm.mlir.constant(0 : index) : i64 + %713 = llvm.mlir.constant(1 : index) : i64 + %714 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb61(%712 : i64) + ^bb61(%715: i64): // 2 preds: ^bb60, ^bb71 + %716 = llvm.icmp "slt" %715, %713 : i64 + llvm.cond_br %716, ^bb62, ^bb72 + ^bb62: // pred: ^bb61 + %717 = llvm.mlir.constant(0 : index) : i64 + %718 = llvm.mlir.constant(32 : index) : i64 + %719 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb63(%717 : i64) + ^bb63(%720: i64): // 2 preds: ^bb62, ^bb70 + %721 = llvm.icmp "slt" %720, %718 : i64 + llvm.cond_br %721, ^bb64, ^bb71 + ^bb64: // pred: ^bb63 + %722 = llvm.mlir.constant(0 : index) : i64 + %723 = llvm.mlir.constant(26 : index) : i64 + %724 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb65(%722 : i64) + ^bb65(%725: i64): // 2 preds: ^bb64, ^bb69 + %726 = llvm.icmp "slt" %725, %723 : i64 + llvm.cond_br %726, ^bb66, ^bb70 + ^bb66: // pred: ^bb65 + %727 = llvm.mlir.constant(0 : index) : i64 + %728 = llvm.mlir.constant(26 : index) : i64 + %729 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb67(%727 : i64) + ^bb67(%730: i64): // 2 preds: ^bb66, ^bb68 + %731 = llvm.icmp "slt" %730, %728 : i64 + llvm.cond_br %731, ^bb68, ^bb69 + ^bb68: // pred: ^bb67 + %732 = llvm.mlir.constant(26 : index) : i64 
+ %733 = llvm.mul %715, %732 : i64 + %734 = llvm.mul %733, %732 : i64 + %735 = llvm.mul %725, %732 : i64 + %736 = llvm.add %734, %735 : i64 + %737 = llvm.add %736, %730 : i64 + %738 = llvm.extractvalue %467[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %739 = llvm.mlir.constant(32 : index) : i64 + %740 = llvm.mul %737, %739 : i64 + %741 = llvm.add %740, %720 : i64 + %742 = llvm.getelementptr %738[%741] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %743 = llvm.load %742 : !llvm.ptr -> f32 + %744 = llvm.extractvalue %98[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %745 = llvm.mlir.constant(21632 : index) : i64 + %746 = llvm.mul %715, %745 : i64 + %747 = llvm.mlir.constant(676 : index) : i64 + %748 = llvm.mul %720, %747 : i64 + %749 = llvm.add %746, %748 : i64 + %750 = llvm.mlir.constant(26 : index) : i64 + %751 = llvm.mul %725, %750 : i64 + %752 = llvm.add %749, %751 : i64 + %753 = llvm.add %752, %730 : i64 + %754 = llvm.getelementptr %744[%753] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %743, %754 : f32, !llvm.ptr + %755 = llvm.add %730, %729 : i64 + llvm.br ^bb67(%755 : i64) + ^bb69: // pred: ^bb67 + %756 = llvm.add %725, %724 : i64 + llvm.br ^bb65(%756 : i64) + ^bb70: // pred: ^bb65 + %757 = llvm.add %720, %719 : i64 + llvm.br ^bb63(%757 : i64) + ^bb71: // pred: ^bb63 + %758 = llvm.add %715, %714 : i64 + llvm.br ^bb61(%758 : i64) + ^bb72: // pred: ^bb61 + %759 = llvm.extractvalue %420[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + llvm.call @free(%759) : (!llvm.ptr) -> () + %760 = llvm.extractvalue %437[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%760) : (!llvm.ptr) -> () + %761 = llvm.extractvalue %467[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%761) : (!llvm.ptr) -> () + %762 = llvm.extractvalue %450[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + llvm.call @free(%762) : (!llvm.ptr) -> () + 
linalg.copy ins(%99 : memref<1x32x26x26xf32>) outs(%12 : memref<1x32x26x26xf32>) + %763 = llvm.extractvalue %73[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + llvm.call @free(%763) : (!llvm.ptr) -> () + %764 = llvm.extractvalue %98[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + llvm.call @free(%764) : (!llvm.ptr) -> () + llvm.return + } +} + + diff --git a/experiments/gemmini/networks/mini_cnn_block.mlir b/experiments/gemmini/networks/mini_cnn_block.mlir new file mode 100644 index 0000000..6ae76a9 --- /dev/null +++ b/experiments/gemmini/networks/mini_cnn_block.mlir @@ -0,0 +1,34 @@ +module { + func.func @mini_cnn_block( + %input: memref<1x3x32x32xf32>, // NCHW input + %w1: memref<16x3x3x3xf32>, // conv1 weights + %w2: memref<32x16x3x3xf32>, // conv2 weights + %out: memref<1x32x26x26xf32> // final output after conv2 + ) { + %conv1 = memref.alloc() : memref<1x16x30x30xf32> + %conv2 = memref.alloc() : memref<1x32x26x26xf32> + + // Conv 1: 3x3, stride 1, NCHW x FCHW + linalg.conv_2d_nchw_fchw + ins(%input, %w1 + : memref<1x3x32x32xf32>, memref<16x3x3x3xf32>) + outs(%conv1 + : memref<1x16x30x30xf32>) + + // Conv 2: 3x3, stride 1, NCHW x FCHW + linalg.conv_2d_nchw_fchw + ins(%conv1, %w2 + : memref<1x16x30x30xf32>, memref<32x16x3x3xf32>) + outs(%conv2 + : memref<1x32x26x26xf32>) + + // Just copy conv2 -> out for now (no FC yet) + linalg.copy + ins(%conv2 : memref<1x32x26x26xf32>) + outs(%out : memref<1x32x26x26xf32>) + + memref.dealloc %conv1 : memref<1x16x30x30xf32> + memref.dealloc %conv2 : memref<1x32x26x26xf32> + return + } +} From 4f19571451d05eda7050f3cbbf46860d366787df Mon Sep 17 00:00:00 2001 From: sparsh Date: Fri, 16 Jan 2026 19:05:57 -0800 Subject: [PATCH 09/13] Docs: add Gemmini lowering coverage table --- experiments/gemmini/SUPPORT.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 experiments/gemmini/SUPPORT.md diff --git a/experiments/gemmini/SUPPORT.md b/experiments/gemmini/SUPPORT.md new 
file mode 100644 index 0000000..06d455a --- /dev/null +++ b/experiments/gemmini/SUPPORT.md @@ -0,0 +1,11 @@ +# Buddy Gemmini lowering coverage (Sparsh) + +This table is a quick view of what we’ve stress-tested and what Buddy lowers into. + +| Test | Input dialect/op | Layout | Proof of Gemmini match | Proof of Gemmini command expansion | Notes | +|---|---|---|---|---|---| +| matmul | linalg.matmul | (varies) | `gemmini.tile_matmul` | `gemmini.intr.loop_ws_config*` + `gemmini.intr.loop_ws` | matmul lowered end-to-end | +| batch_matmul | linalg.batch_matmul | (varies) | `gemmini.tile_*` | `gemmini.intr.*` | batched path works | +| conv (NHWC/HWCF) | linalg.conv_2d_nhwc_hwcf | NHWC x HWCF | `gemmini.tile_conv` | `gemmini.intr.loop_conv_ws_config*` + `gemmini.intr.loop_conv_ws` | conv lowered to WS loop | +| conv (NCHW/FCHW) | linalg.conv_2d_nchw_fchw | NCHW x FCHW | `gemmini.tile_conv` | `gemmini.intr.loop_conv_ws_config*` + `gemmini.intr.loop_conv_ws` | alternate layout works | +| mini CNN block | 2x conv + copy | NCHW/FCHW | 2x `gemmini.tile_conv` | `gemmini.intr.loop_conv_ws*` appears | multi-layer block lowers | From a81f917524c3d75011c6ede586efc26fe7f1e7ea Mon Sep 17 00:00:00 2001 From: sparsh Date: Fri, 16 Jan 2026 20:35:43 -0800 Subject: [PATCH 10/13] IREE: add iree-compile IR dump (print-after-all) for baseline pipeline --- .../iree/logs/iree.print-after-all.mlir | 27771 ++++++++++++++++ 1 file changed, 27771 insertions(+) create mode 100644 experiments/iree/logs/iree.print-after-all.mlir diff --git a/experiments/iree/logs/iree.print-after-all.mlir b/experiments/iree/logs/iree.print-after-all.mlir new file mode 100644 index 0000000..509d405 --- /dev/null +++ b/experiments/iree/logs/iree.print-after-all.mlir @@ -0,0 +1,27771 @@ +// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // +module { + func.func @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf 
%arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // +module { + util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // +module { + util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // +module { + util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After ConvertShardToFlowPass (iree-convert-shard-to-flow) //----- // +module { + util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // +module { + util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // 
+module { + util.func public @multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // +module { + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2:2 = util.call @_multiple_results(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) + %3 = hal.tensor.export %2#0 "output0" : tensor<2xf32> -> !hal.buffer_view + %4 = hal.tensor.export %2#1 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view + } + util.func private @_multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) attributes {hal.abi.convention = #hal.abi.convention} { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @_multiple_results(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) attributes {hal.abi.convention = #hal.abi.convention} { + %0 = math.absf %arg0 : tensor<2xf32> + %1 = math.absf %arg1 : tensor<2xf32> + util.return %0, %1 : tensor<2xf32>, tensor<2xf32> +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: 
!hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2:2 = util.call @_multiple_results(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) + %3 = hal.tensor.export %2#0 "output0" : tensor<2xf32> -> !hal.buffer_view + %4 = hal.tensor.export %2#1 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Inliner (inline) //----- // +module { + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import 
%arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + 
util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module { + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {hal.device.targets = [#device_target_local]} { + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : 
!hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After 
ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : 
i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, 
iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> 
tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export 
%3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // +util.func public @multiple_results(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyDepthwiseConvPass (simplify-depthwise-conv) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: 
tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 
: !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = math.absf %0 : tensor<2xf32> + %3 = math.absf %1 : tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: 
tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: 
f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, 
%input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%0 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After InsertTensorBarriersPass (iree-dispatch-creation-insert-tensor-barriers) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield 
%10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = 
hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : 
f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf 
%in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After 
PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: 
tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> 
tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ConvertStridedContractionToContractionPass (iree-global-opt-convert-strided-contraction-to-contraction) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func 
@multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = 
iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + 
linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : 
f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After 
SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) 
-> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = 
#hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : 
i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public 
@multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 
"input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], 
iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 
"input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: 
!hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 
= hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : 
f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + 
%10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : 
!hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, 
%out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : 
!hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: 
tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = 
iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection 
= {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes 
{iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = 
#iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump 
After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = 
iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After 
CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} 
{ + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %10 = math.absf %in : f32 + linalg.yield %10 : f32 + } -> tensor<2xf32> + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 
= tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + %9 = iree_tensor_ext.compute_barrier.end %8 : tensor<2xf32> -> tensor<2xf32> + %10 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %11 = hal.tensor.export %9 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %10, %11 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + 
linalg.yield %12 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + %9 = iree_tensor_ext.compute_barrier.end %8 : tensor<2xf32> -> tensor<2xf32> + %10 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %11 = hal.tensor.export %9 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %10, %11 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + 
^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = 
hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) 
attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> 
+ %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic 
{indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = 
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = 
iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After 
CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: 
tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : 
!hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: 
f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = 
["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FormSplitReductionDispatchesPass (iree-dispatch-creation-form-split-reduction-dispatches) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = 
hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After PropagateEncodingsPass 
(iree-dispatch-creation-propagate-encodings) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : 
tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, 
iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = 
iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = 
flow.dispatch.region -> (tensor<2xf32>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = flow.dispatch.region -> (tensor<2xf32>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = flow.dispatch.region -> (tensor<2xf32>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> 
(d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = flow.dispatch.region -> (tensor<2xf32>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = flow.dispatch.region -> (tensor<2xf32>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : 
tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %6 = iree_tensor_ext.compute_barrier.end %5 : tensor<2xf32> -> tensor<2xf32> + %7 = flow.dispatch.region -> (tensor<2xf32>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %8 = iree_tensor_ext.compute_barrier.end %7 : tensor<2xf32> -> tensor<2xf32> + %9 = hal.tensor.export %6 "output0" : tensor<2xf32> -> !hal.buffer_view + %10 = hal.tensor.export %8 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %9, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + 
%12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> 
tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After HoistUniformScalarComputePass (iree-dispatch-creation-hoist-uniform-scalar-compute) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } 
+ %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FuseEncodingOpsIntoDispatchRegionsPass (iree-dispatch-creation-fuse-encoding-ops-into-dispatch-regions-pass) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %5 = 
iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> 
tensor<2xf32> + %6 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = iree_tensor_ext.compute_barrier.start %0 : tensor<2xf32> -> 
tensor<2xf32> + %2 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %3 = iree_tensor_ext.compute_barrier.start %2 : tensor<2xf32> -> tensor<2xf32> + %4 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %5 = iree_tensor_ext.compute_barrier.end %4 : tensor<2xf32> -> tensor<2xf32> + %6 = flow.dispatch.region -> (tensor<2xf32>) { + %10 = tensor.empty() : tensor<2xf32> + %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%10 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = math.absf %in : f32 + linalg.yield %12 : f32 + } -> tensor<2xf32> + flow.return %11 : tensor<2xf32> + } + %7 = iree_tensor_ext.compute_barrier.end %6 : tensor<2xf32> -> tensor<2xf32> + %8 = hal.tensor.export %5 "output0" : tensor<2xf32> -> !hal.buffer_view + %9 = hal.tensor.export %7 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %8, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After RemoveTensorBarriersPass (iree-dispatch-creation-remove-tensor-barriers) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.region -> (tensor<2xf32>) { + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = 
[affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + flow.return %7 : tensor<2xf32> + } + %3 = flow.dispatch.region -> (tensor<2xf32>) { + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + flow.return %7 : tensor<2xf32> + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : 
tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: 
!iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, 
%input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, 
!hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + 
iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) 
-> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After BitcastUnsupportedElementTypesPass (iree-dispatch-creation-bitcast-unsupported-element-types) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : 
!iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, 
!hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = 
[0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = 
flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = 
[2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes 
= [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AttributeCallGraphPass (iree-util-attribute-call-graph) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", 
cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = 
flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> 
tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, 
!hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = 
[0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = 
flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = 
tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", 
cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch.workgroups(%0) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %3 = 
flow.dispatch.workgroups(%1) : (tensor<2xf32>) -> tensor<2xf32> = + (%arg2: !iree_tensor_ext.dispatch.tensor>, %arg3: !iree_tensor_ext.dispatch.tensor>) { + %6 = iree_tensor_ext.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%6 : tensor<2xf32>) outs(%7 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %8, %arg3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + flow.return + } count() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + 
flow.executable.export public @multiple_results_dispatch_0 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + flow.executable private @multiple_results_dispatch_1 { + flow.executable.export public @multiple_results_dispatch_1 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_1(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public 
@multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_1::@multiple_results_dispatch_1(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : 
index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + flow.executable private @multiple_results_dispatch_1 { + flow.executable.export public @multiple_results_dispatch_1_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_1_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_1::@multiple_results_dispatch_1_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // +flow.executable private @multiple_results_dispatch_1 { + flow.executable.export public @multiple_results_dispatch_1_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_1_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } +} + +// -----// IR Dump After 
CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_1::@multiple_results_dispatch_1_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // +flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : 
f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } +} + +// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> 
!iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : 
tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local 
+ flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, 
!hal.buffer_view + } +} + + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CanonicalizePass (iree-flow-canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> 
!hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : 
(tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : 
index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> 
!hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> 
!iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> 
(index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // 
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: 
tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = 
iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 
32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : 
tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : 
!hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = 
tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = 
#hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : 
(tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : 
tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After CloneToConsumersPass (iree-stream-clone-to-consumers) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + flow.executable private @multiple_results_dispatch_0 { + flow.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() 
-> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + flow.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !iree_tensor_ext.dispatch.tensor>, %arg1: !iree_tensor_ext.dispatch.tensor>) { + %0 = iree_tensor_ext.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %1 = tensor.empty() : tensor<2xf32> + %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<2xf32>) outs(%1 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = math.absf %in : f32 + linalg.yield %3 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2xf32> + %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2xf32> + %2 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0) : (tensor<2xf32>) -> tensor<2xf32> + %3 = flow.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1) : (tensor<2xf32>) -> tensor<2xf32> + %4 = hal.tensor.export %2 "output0" : tensor<2xf32> -> !hal.buffer_view + %5 = hal.tensor.export %3 "output1" : tensor<2xf32> -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // +#executable_target_embedded_elf_arm_64 = 
#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> 
(!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + %c2 = arith.constant 2 : index + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.transfer %1 : !stream.resource{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} + %element_type_f32_0 = hal.element_type : i32 + %dense_row_major_1 = hal.encoding_type : i32 + %c2_2 = arith.constant 2 : index + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2_2]) type(%element_type_f32_0) encoding(%dense_row_major_1) + %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%3} + %5 = stream.async.transfer %4 : !stream.resource{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} + %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6} + %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %9 = stream.tensor.dispatch 
on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8} + %10 = stream.async.transfer %7 : !stream.resource<*>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%6} + %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource{%6} -> !hal.buffer_view + %12 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%8} + %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource{%8} -> !hal.buffer_view + util.return %11, %13 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + 
builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + %c2 = arith.constant 2 : index + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.transfer %1 : !stream.resource{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} + %element_type_f32_0 = hal.element_type : i32 + %dense_row_major_1 = hal.encoding_type : i32 + 
%c2_2 = arith.constant 2 : index + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2_2]) type(%element_type_f32_0) encoding(%dense_row_major_1) + %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%3} + %5 = stream.async.transfer %4 : !stream.resource{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} + %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6} + %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8} + %10 = stream.async.transfer %7 : !stream.resource<*>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%6} + %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource{%6} -> !hal.buffer_view + %12 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%8} + %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource{%8} -> !hal.buffer_view + util.return %11, %13 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: 
!stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + %element_type_f32_0 = hal.element_type : i32 + %dense_row_major_1 = hal.encoding_type : i32 + hal.buffer_view.assert<%arg1 : 
!hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32_0) encoding(%dense_row_major_1) + %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%3} + %5 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource{%3} -> !stream.resource<*>{%3} + %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6} + %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8} + %10 = stream.async.clone on(#hal.device.affinity<@__device_0>) %7 : !stream.resource<*>{%6} -> !stream.resource{%6} + %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource{%6} -> !hal.buffer_view + %12 = stream.async.clone on(#hal.device.affinity<@__device_0>) %9 : !stream.resource<*>{%8} -> !stream.resource{%8} + %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource{%8} -> !hal.buffer_view + util.return %11, %13 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Inliner (inline) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = 
#iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: 
tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + %element_type_f32_0 = hal.element_type : i32 + %dense_row_major_1 = hal.encoding_type : i32 + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32_0) encoding(%dense_row_major_1) + %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%3} + %5 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource{%3} -> !stream.resource<*>{%3} + %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6} + %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8} + %10 = stream.async.clone on(#hal.device.affinity<@__device_0>) %7 : !stream.resource<*>{%6} -> !stream.resource{%6} + %11 = stream.tensor.export 
on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource{%6} -> !hal.buffer_view + %12 = stream.async.clone on(#hal.device.affinity<@__device_0>) %9 : !stream.resource<*>{%8} -> !stream.resource{%8} + %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource{%8} -> !hal.buffer_view + util.return %11, %13 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + %element_type_f32_0 = hal.element_type : i32 + %dense_row_major_1 = hal.encoding_type : i32 + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32_0) encoding(%dense_row_major_1) + %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%3} + %5 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource{%3} -> !stream.resource<*>{%3} + %6 = 
stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %7 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%6} + %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %9 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%5) : (tensor<2xf32> in !stream.resource<*>{%3}) -> tensor<2xf32> in !stream.resource<*>{%8} + %10 = stream.async.clone on(#hal.device.affinity<@__device_0>) %7 : !stream.resource<*>{%6} -> !stream.resource{%6} + %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2xf32> in !stream.resource{%6} -> !hal.buffer_view + %12 = stream.async.clone on(#hal.device.affinity<@__device_0>) %9 : !stream.resource<*>{%8} -> !stream.resource{%8} + %13 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %12 : tensor<2xf32> in !stream.resource{%8} -> !hal.buffer_view + util.return %11, %13 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) 
%arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, 
%output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : 
!hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : 
!stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = 
iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : 
!hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", 
[#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout 
= "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync 
func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export 
on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in 
: f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + 
%7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch 
on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : 
!hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = 
stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = 
arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = 
stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable 
private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> 
in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : 
i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = 
hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After 
FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + 
return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : 
tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SpecializeEncodingsPass (iree-stream-specialize-encodings) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = 
tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2xf32> : index + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %2 = stream.async.clone on(#hal.device.affinity<@__device_0>) %1 : !stream.resource{%0} -> !stream.resource<*>{%0} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%0} + %4 = stream.async.clone on(#hal.device.affinity<@__device_0>) %3 : !stream.resource{%0} -> !stream.resource<*>{%0} + %5 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%2) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %6 = 
stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%4) : (tensor<2xf32> in !stream.resource<*>{%0}) -> tensor<2xf32> in !stream.resource<*>{%0} + %7 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%0} -> !stream.resource{%0} + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + %9 = stream.async.clone on(#hal.device.affinity<@__device_0>) %6 : !stream.resource<*>{%0} -> !stream.resource{%0} + %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2xf32> in !stream.resource{%0} -> !hal.buffer_view + util.return %8, %10 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // +stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> 
tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } +} + +// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> 
!stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After MaterializeEncodingsPass (iree-stream-materialize-encodings) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : 
!stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> 
message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: 
tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> 
!hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : 
!stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes 
{stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func 
@multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in 
!stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store 
%4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = 
stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes 
= [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) 
-> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import 
on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import 
on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = 
#iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: 
tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// 
-----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : 
tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource<*>{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %5 = 
stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource<*>{%c8}) -> !stream.resource<*>{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource<*>{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) 
{ + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = 
stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %1 = stream.async.clone 
on(#hal.device.affinity<@__device_0>) %0 : !stream.resource{%c8} -> !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %3 = stream.async.clone on(#hal.device.affinity<@__device_0>) %2 : !stream.resource{%c8} -> !stream.resource{%c8} + %4 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %5 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%3[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %6 = stream.async.clone on(#hal.device.affinity<@__device_0>) %4 : !stream.resource{%c8} -> !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.async.clone on(#hal.device.affinity<@__device_0>) %5 : !stream.resource{%c8} -> !stream.resource{%c8} + %9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %9 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = 
affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + 
%dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : 
!hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) 
type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import 
on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func 
@multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch 
on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = 
stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : 
(!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> 
!iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load 
%0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%0[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%1[%c0 to %c8 for %c8]) : 
(!stream.resource{%c8}) -> !stream.resource{%c8} + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %4, %5 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg2[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %6 = stream.async.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg3[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %5, %6 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: 
!stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SyncInitializersPass (iree-stream-sync-initializers) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private 
@__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : 
!hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func 
@multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.timepoint.immediate => !stream.timepoint + %3 = stream.timepoint.immediate => !stream.timepoint + %4 = stream.timepoint.join max(%2, %3) => !stream.timepoint + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%4) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %8:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %9 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %10 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %9, %10 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %8#0, %8#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %5:2 = stream.timepoint.await 
%result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : 
!iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %2 = stream.timepoint.immediate => !stream.timepoint + %3 = stream.timepoint.immediate => !stream.timepoint + %4 = stream.timepoint.join max(%2, %3) => !stream.timepoint + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%4) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %8:2 = stream.async.concurrent with(%arg2 as 
%arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %9 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %10 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %9, %10 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %8#0, %8#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %5:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, 
%output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export 
on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute 
on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", 
"embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) 
attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = 
stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, 
#map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = 
stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: 
!stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: 
!stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = 
#hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = 
stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %5:2 = stream.async.concurrent with(%arg2 as %arg4: !stream.resource{%c8}, %arg3 as %arg5: !stream.resource{%c8}) -> (!stream.resource{%c8}, !stream.resource{%c8}) { + %6 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg4[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + %7 = stream.async.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%arg5[%c0 to %c8 for %c8]) : (!stream.resource{%c8}) -> !stream.resource{%c8} + stream.yield %6, %7 : !stream.resource{%c8}, !stream.resource{%c8} + } + stream.yield %5#0, %5#1 : !stream.resource{%c8}, !stream.resource{%c8} + } => !stream.timepoint + %2:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource{%c8}, !stream.resource{%c8} + %3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %3, %4 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = 
"+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %c0_0 = arith.constant 0 : index + %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({ + [0, 0] = %c8, + [0, 0] = %c8 + }) : index + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%2#0} => !stream.timepoint + %3 = stream.resource.subview %result[%2#1] : !stream.resource{%2#0} -> !stream.resource{%c8} + %4 = stream.resource.subview %result[%2#2] : !stream.resource{%2#0} -> !stream.resource{%c8} + %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %3 as %arg4: !stream.resource{%c8}, %4 as %arg5: !stream.resource{%c8}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c8] : !stream.resource{%c8} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : 
!stream.resource{%c8}, + wo %arg5[%c0_0 for %c8] : !stream.resource{%c8} + } + } + } => !stream.timepoint + %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource{%c8}, !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %8 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After EmplaceTransientsPass (iree-stream-emplace-transients) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = 
iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %c0_0 = arith.constant 0 : index + %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({ + [0, 0] = %c8, + [0, 0] = %c8 + }) : index + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%2#0} => !stream.timepoint + %3 = stream.resource.subview %result[%2#1] : 
!stream.resource{%2#0} -> !stream.resource{%c8} + %4 = stream.resource.subview %result[%2#2] : !stream.resource{%2#0} -> !stream.resource{%c8} + %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %3 as %arg4: !stream.resource{%c8}, %4 as %arg5: !stream.resource{%c8}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c8] : !stream.resource{%c8} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg5[%c0_0 for %c8] : !stream.resource{%c8} + } + } + } => !stream.timepoint + %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource{%c8}, !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %8 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeTransientSizeQueriesPass (iree-stream-materialize-transient-size-queries) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes 
{stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %c0_0 = arith.constant 0 : index + %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({ + [0, 0] = %c8, + [0, 0] = %c8 + }) : index + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%2#0} => !stream.timepoint + %3 = stream.resource.subview %result[%2#1] : !stream.resource{%2#0} -> !stream.resource{%c8} + %4 = stream.resource.subview %result[%2#2] : !stream.resource{%2#0} -> !stream.resource{%c8} + %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %3 as %arg4: !stream.resource{%c8}, %4 as %arg5: !stream.resource{%c8}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c8] : !stream.resource{%c8} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg5[%c0_0 for %c8] : !stream.resource{%c8} + } + } + } => !stream.timepoint + %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource{%c8}, !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return 
%7, %8 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %c0_0 = arith.constant 0 : index + %2:3 = stream.resource.pack on(#hal.device.affinity<@__device_0>) slices({ + [0, 0] = %c8, + [0, 0] = %c8 + }) : index + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%2#0} => !stream.timepoint + %3 = stream.resource.subview %result[%2#1] : !stream.resource{%2#0} -> !stream.resource{%c8} + %4 = stream.resource.subview %result[%2#2] : !stream.resource{%2#0} -> !stream.resource{%c8} + %5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %3 as %arg4: !stream.resource{%c8}, %4 as %arg5: !stream.resource{%c8}) { + stream.cmd.concurrent { + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c8] : !stream.resource{%c8} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg5[%c0_0 for %c8] : !stream.resource{%c8} + } + } + } => !stream.timepoint + %6:2 = stream.timepoint.await %5 => %3, %4 : !stream.resource{%c8}, !stream.resource{%c8} + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %7, %8 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %c0_0 = arith.constant 0 : 
index + %c0_1 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c64_2 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c128_3 = arith.constant 128 : index + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128_3} => !stream.timepoint + %2 = stream.resource.subview %result[%c0_1] : !stream.resource{%c128_3} -> !stream.resource{%c8} + %3 = stream.resource.subview %result[%c64_2] : !stream.resource{%c128_3} -> !stream.resource{%c8} + %4 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %2 as %arg4: !stream.resource{%c8}, %3 as %arg5: !stream.resource{%c8}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c8] : !stream.resource{%c8} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg5[%c0_0 for %c8] : !stream.resource{%c8} + } + } + } => !stream.timepoint + %5:2 = stream.timepoint.await %4 => %2, %3 : !stream.resource{%c8}, !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#0 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5#1 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: 
tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export 
on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], 
iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c0_0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 
{ + ro %arg2[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0_0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = 
stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = 
arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : 
tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in 
!stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = 
arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + 
util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], 
strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : 
!stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = 
stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute 
on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = 
#device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = 
stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AutomaticReferenceCountingPass (iree-stream-automatic-reference-counting) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", 
"embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) 
attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = 
stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AnnotateConstantTransientSizePass (iree-stream-annotate-constant-transient-size) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : 
!iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + 
stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 
workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : 
!hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: 
tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : 
tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : 
!stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : 
!hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : 
index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in 
!stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: 
!stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = 
stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", 
[#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major 
= hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After 
IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func 
public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : 
!stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ReuseAllocationsPass (iree-stream-reuse-allocations) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => 
with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import 
on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, 
iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview 
%3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> 
message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> 
(%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 
: tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : 
!stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, 
index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import 
on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = 
"arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 
= arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : 
tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> 
tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 
{ + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = 
arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized 
on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : 
!hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + 
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c8] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32 { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c64 for %c8] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseDispatchBindingsPass 
(iree-stream-fuse-dispatch-bindings) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%arg2] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%arg3] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> 
!iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %c0_0 = arith.constant 0 : index + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c0 : index, index) { + ro %arg2[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c64 : index, index) { + ro %arg3[%c0_0 for %c8] : 
!stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : 
index}, %arg2: index {stream.values = [0 : index]}, %arg3: index {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]}) { + %c0 = arith.constant 0 : index + %0 = stream.binding.subspan %arg0[%arg2] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %1 = stream.binding.subspan %arg1[%arg3] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%2 : tensor<2xf32>) outs(%3 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = math.absf %in : f32 + linalg.yield %5 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import 
on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %c0_0 = arith.constant 0 : index + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c0 : index, index) { + ro %arg2[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c64 : index, index) { + ro %arg3[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = 
#iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: index {stream.values = [0 : index]}, %arg3: index {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]}) { + %0:2 = util.assume.int + %arg2[, ], + %arg3[, ] + : index, index + %c0 = arith.constant 0 : index + %1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %4 = tensor.empty() : tensor<2xf32> + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = math.absf %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %5, %2, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %c0_0 = arith.constant 0 : index + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c0 : index, index) { + ro %arg2[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0, %c64 : index, index) { + ro %arg3[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + } + } => 
!stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + %0 = arith.extui %arg2 : i32 to i64 + %1 = 
arith.extui %arg3 : i32 to i64 + %c32_i64 = arith.constant 32 : i64 + %2 = arith.shli %1, %c32_i64 : i64 + %3 = arith.ori %0, %2 : i64 + %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index + %5 = arith.extui %arg4 : i32 to i64 + %6 = arith.extui %arg5 : i32 to i64 + %c32_i64_0 = arith.constant 32 : i64 + %7 = arith.shli %6, %c32_i64_0 : i64 + %8 = arith.ori %5, %7 : i64 + %9 = arith.index_castui %8 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index + %10:2 = util.assume.int + %4[, ], + %9[, ] + : index, index + %c0 = arith.constant 0 : index + %11 = stream.binding.subspan %arg0[%10#0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %12 = stream.binding.subspan %arg1[%10#1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %13 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %14 = tensor.empty() : tensor<2xf32> + %15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%13 : tensor<2xf32>) outs(%14 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %16 = math.absf %in : f32 + linalg.yield %16 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %15, %12, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + 
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %c0_0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %c32_i64 = arith.constant 32 : i64 + %c0_i64_1 = arith.constant 0 : i64 + %c0_i32_2 = arith.constant 0 : i32 + %c0_i64_3 = arith.constant 0 : i64 + %c0_i32_4 = arith.constant 0 : i32 + %c32_i64_5 = arith.constant 32 : i64 + %c0_i64_6 = arith.constant 0 : i64 + %c0_i32_7 = arith.constant 0 : i32 + %c0_i64_8 = arith.constant 0 : i64 + %c0_i32_9 = arith.constant 0 : i32 + %c32_i64_10 = arith.constant 32 : i64 + %c0_i64_11 = arith.constant 0 : i64 + %c0_i32_12 = arith.constant 0 : i32 + %c64_i64 = arith.constant 64 : i64 + %c64_i32 = arith.constant 64 : i32 + %c32_i64_13 = arith.constant 32 : i64 + %c0_i64_14 = arith.constant 0 : i64 + %c0_i32_15 = arith.constant 0 : i32 + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7 : i32, i32, i32, i32) { + ro %arg2[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + 
stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32_9, %c0_i32_12, %c64_i32, %c0_i32_15 : i32, i32, i32, i32) { + ro %arg3[%c0_0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0_0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: 
tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : 
!stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) 
=> with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : 
i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in 
!stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 
for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + 
} + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + %c0 = arith.constant 0 : index + %c32_i64 = arith.constant 32 : i64 + %0 = arith.extui %arg4 : i32 to i64 + %1 = arith.extui %arg5 : i32 to i64 + %2 = arith.shli %1, %c32_i64 : i64 + %3 = arith.ori %0, %2 : i64 + %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index + %5 = util.assume.int %4[, ] : index + %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %9 = tensor.empty() : tensor<2xf32> + %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type 
: i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> 
in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + %c0 = arith.constant 0 : index + %c32_i64 = arith.constant 32 : i64 + %0 = arith.extui %arg4 : i32 to i64 + %1 = arith.extui %arg5 : i32 to i64 + %2 = arith.shli %1, %c32_i64 : i64 + %3 = arith.ori %0, %2 : i64 + %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index + %5 = util.assume.int %4[, ] : index + %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> 
!iree_tensor_ext.dispatch.tensor> + %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %9 = tensor.empty() : tensor<2xf32> + %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute 
on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module 
attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + %c0 = arith.constant 0 : index + %c32_i64 = arith.constant 32 : i64 + %0 = arith.extui %arg4 : i32 to i64 + %1 = arith.extui %arg5 : i32 to i64 + %2 = arith.shli %1, %c32_i64 : i64 + %3 = arith.ori %0, %2 : i64 + %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index + %5 = util.assume.int %4[, ] : index + %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %9 = tensor.empty() : tensor<2xf32> + %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32, %c0_i32, %c64_i32, %c0_i32 : i32, i32, i32, i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = 
stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c32_i64 = arith.constant 32 : i64 + %0 
= arith.extui %arg2 : i32 to i64 + %1 = arith.extui %c0_i32 : i32 to i64 + %2 = arith.shli %1, %c32_i64 : i64 + %3 = arith.ori %0, %2 : i64 + %4 = arith.index_castui %3 {stream.alignment = 64 : index, stream.values = [0 : index, 64 : index]} : i64 to index + %5 = util.assume.int %4[, ] : index + %6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %7 = stream.binding.subspan %arg1[%5] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %8 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %9 = tensor.empty() : tensor<2xf32> + %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%8 : tensor<2xf32>) outs(%9 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %10, %7, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in 
!stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync 
func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> 
!stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: 
!stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) 
type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // 
+util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for 
%c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized 
on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", 
[#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + 
%c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in 
!stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = 
tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + 
stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 
workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // 
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, 
offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func 
@multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AttributeCallGraphPass (iree-util-attribute-call-graph) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => 
!stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to 
index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in 
!stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = 
"arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: 
tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : 
!stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> 
!iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute 
on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = 
#hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index 
+ %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) 
%5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = 
["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> 
tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func 
@multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> 
!stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: 
!stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // 
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, 
offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch 
@multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: 
!stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> 
message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + stream.executable private @multiple_results_dispatch_0 { + stream.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 workgroups() -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + stream.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: i32) { + %c0 = arith.constant 0 : index + %0 = arith.index_castui %arg2 : i32 to index + %1 = util.assume.int %0[, ] : index + %2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %3 = stream.binding.subspan %arg1[%1] : !stream.binding -> !iree_tensor_ext.dispatch.tensor> + %4 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %5 = tensor.empty() : tensor<2xf32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<2xf32>) outs(%5 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %7 = math.absf %in : f32 + linalg.yield %7 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %6, %3, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes 
{iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 
= stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load 
layout(#pipeline_layout) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : 
!hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#map = affine_map<(d0) -> (d0)> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: 
f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2xf32> in !stream.resource{%c8} + %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource{%c128} => !stream.timepoint + %2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c8}, %1 as %arg3: !stream.resource{%c8}, %result as %arg4: !stream.resource{%c128}) { + stream.cmd.concurrent { + stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c0_i32 : i32) { + ro %arg2[%c0 
for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + stream.cmd.dispatch @multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32(%c64_i32 : i32) { + ro %arg3[%c0 for %c8] : !stream.resource{%c8}, + wo %arg4[%c0 for %c128] : !stream.resource{%c128} + } + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c128} + %4 = stream.resource.subview %3[%c0] : !stream.resource{%c128} -> !stream.resource{%c8} + %5 = stream.resource.subview %3[%c64] : !stream.resource{%c128} -> !stream.resource{%c8} + %6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<2xf32> in !stream.resource{%c8} -> !hal.buffer_view + util.return %6, %7 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SpecializeExportsPass (iree-codegen-specialize-exports) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = 
Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } +} + +// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() 
: tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = 
arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = 
[2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // +module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } +} + +// -----// IR Dump After MaterializeDeviceEncodingPass 
(iree-codegen-materialize-device-encoding) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After CPUPropagateDataLayoutPass (iree-codegen-cpu-propagate-data-layout) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) 
alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : 
tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After ConvertAccGEMMToGEMMPass (iree-convert-accgemm-to-gemm) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = 
hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } 
-> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- // +module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } +} + +// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + 
return + } + } +} + +// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // +hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) 
outs(%6 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } +} + +// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // +module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%6 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %8 = math.absf %in : f32 + linalg.yield %8 : f32 + } -> tensor<2xf32> + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } +} + +// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass 
(iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %5 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0], sizes = [2], strides = [1] : !iree_tensor_ext.dispatch.tensor> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[%c0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_tensor_ext.dispatch.tensor.store %7, %4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !iree_tensor_ext.dispatch.tensor> + return +} + +// -----// IR Dump After BufferizeDispatchTensorLoadStorePass (iree-codegen-bufferize-dispatch-tensor-load-store) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 
: index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor> + %7 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %8 = tensor.empty() : tensor<2xf32> + %9 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %8) -> (tensor<2xf32>) { + %10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %11 = math.absf %in : f32 + linalg.yield %11 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %10 into %arg1[%c0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %9, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After CombineLayoutTransformationPass (iree-codegen-combine-layout-transformation) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() 
attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[%c0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : 
index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After CSE (cse) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : 
memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> 
(tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = 
math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : 
tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After LLVMCPUTileAndFuseProducerConsumerPass (iree-llvmcpu-tile-and-fuse-producer-consumer) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() 
attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + 
%3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After LLVMCPUTileAndFuseProducerConsumerPass (iree-llvmcpu-tile-and-fuse-producer-consumer) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = 
Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall 
(%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = 
#iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After ForallToForPass (iree-codegen-forall-to-for) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + 
iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After LLVMCPUPeelPass (iree-llvmcpu-peel) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes 
{translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After LLVMCPUTileToVectorSizePass (iree-llvmcpu-tile-to-vector-size) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = 
hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %5 = iree_codegen.load_from_buffer %3 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %6 = tensor.empty() : tensor<2xf32> + %7 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %6) -> (tensor<2xf32>) { + %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : tensor<2xf32>) outs(%arg1 : tensor<2xf32>) attrs = {lowering_config = #iree_cpu.lowering_config} { + ^bb0(%in: f32, %out: f32): + %9 = math.absf %in : f32 + linalg.yield %9 : f32 + } -> tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %8 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %7, %4 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) 
binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %10 = math.absf %9 : vector<2xf32> + %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %9 = 
vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %10 = math.absf %9 : vector<2xf32> + %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %10 = math.absf %9 : vector<2xf32> + %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + 
iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After CSE (cse) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %10 = math.absf %9 : vector<2xf32> + %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + 
%1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %7 = tensor.empty() : tensor<2xf32> + %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %10 = math.absf %9 : vector<2xf32> + %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = 
hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, #hal.descriptor_type> -> tensor<2xf32> + %7 = iree_codegen.load_from_buffer %5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> -> tensor<2xf32> + %8 = tensor.empty() : tensor<2xf32> + %9 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %10 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %11 = math.absf %10 : vector<2xf32> + %12 = vector.transfer_write %11, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %12 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %9, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After EmptyTensorToAllocTensorPass (empty-tensor-to-alloc-tensor) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %6 = iree_codegen.load_from_buffer %4 : memref<2xf32, 
#hal.descriptor_type> -> tensor<2xf32> + %7 = iree_codegen.load_from_buffer %5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> -> tensor<2xf32> + %8 = scf.forall (%arg0) = (0) to (2) step (2) shared_outs(%arg1 = %7) -> (tensor<2xf32>) { + %9 = vector.transfer_read %6[%c0], %0 {in_bounds = [true]} : tensor<2xf32>, vector<2xf32> + %10 = math.absf %9 : vector<2xf32> + %11 = vector.transfer_write %10, %arg1[%c0] {in_bounds = [true]} : vector<2xf32>, tensor<2xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg1[0] [2] [1] : tensor<2xf32> into tensor<2xf32> + } + } {mapping = [#iree_codegen.workgroup_mapping]} + iree_codegen.store_to_buffer %8, %5 : tensor<2xf32> into memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + return +} + +// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %4[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %5[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + 
%subview = memref.subview %5[0] [2] [1] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> to memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) outs(%subview : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } + } {mapping = [#iree_codegen.workgroup_mapping]} + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) outs(%5 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } + return +} + +// -----// IR Dump After IREEInjectAssumeAlignmentPass (iree-codegen-inject-assume-alignment) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 
= vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %subview = memref.subview %assume_align_0[0] [2] [1] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> to memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) outs(%subview : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } + } {mapping = [#iree_codegen.workgroup_mapping]} + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) outs(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } + return +} + +// -----// IR Dump After ResolveShapedTypeResultDimsPass (resolve-shaped-type-result-dims) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = 
hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %subview = memref.subview %assume_align_0[0] [2] [1] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> to memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) outs(%subview : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } + } {mapping = [#iree_codegen.workgroup_mapping]} + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) outs(%assume_align_0 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } + return +} + +// -----// IR Dump After IREECodegenCanonicalizerPass (iree-codegen-canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = 
arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After CSE (cse) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + 
%assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After IREECodegenCanonicalizerPass (iree-codegen-canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return 
+} + +// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = 
hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = 
memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After DropVectorUnitDimsPass (iree-codegen-drop-vector-unit-dims) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// 
-----// IR Dump After LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %0 = ub.poison : f32 + %c0 = arith.constant 0 : index + %1 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %2 = arith.index_castui %1 : i32 to index + %3 = util.assume.int %2[, ] : index + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = 
Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %4, 64 : memref<2xf32, #hal.descriptor_type> + %5 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%3) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %5, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %6 = vector.transfer_read %assume_align[%c0], %0 {in_bounds = [true]} : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.transfer_write %7, %assume_align_0[%c0] {in_bounds = [true]} : vector<2xf32>, memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After VectorTransferLoweringPass (iree-codegen-vector-transfer-lowering) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall 
(%arg0) = (0) to (2) step (2) { + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = 
hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) 
flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After 
VerifyWorkgroupDistributionPass (iree-codegen-verify-workgroup-distribution) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() attributes {translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + scf.forall (%arg0) = (0) to (2) step (2) { + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + } {mapping = [#iree_codegen.workgroup_mapping]} + return +} + +// -----// IR Dump After ReconcileTranslationInfoPass (iree-codegen-reconcile-translation-info) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 
ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %5 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%workgroup_id_x] + %6 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.store %7, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + iree_codegen.workgroup_count_hint(1) + return + } + } +} + +// -----// IR Dump After ResolveWorkgroupCountHintsPass (iree-codegen-resolve-workgroup-count-hints) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %c1 = arith.constant 1 : index + %c1_0 = arith.constant 1 : index + %c1_1 = arith.constant 1 : index + hal.return %c1, %c1_0, %c1_1 : index, index, index + } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %5 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%workgroup_id_x] + %6 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.store %7, %assume_align_0[%c0] : memref<2xf32, 
strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + return + } + } +} + +// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %c1 = arith.constant 1 : index + %c1_0 = arith.constant 1 : index + %c1_1 = arith.constant 1 : index + hal.return %c1, %c1_0, %c1_1 : index, index, index + } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = util.assume.int %1[, ] : index + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %3, 64 : memref<2xf32, #hal.descriptor_type> + %4 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %4, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %workgroup_id_x = hal.interface.workgroup.id[0] : index 
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index + %c2 = arith.constant 2 : index + %5 = arith.muli %workgroup_id_x, %c2 overflow : index + %6 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %7 = math.absf %6 : vector<2xf32> + vector.store %7, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + return + } + } +} + +// -----// IR Dump After DropCompilerHintsPass (iree-util-drop-compiler-hints) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %c1 = arith.constant 1 : index + %c1_0 = arith.constant 1 : index + %c1_1 = arith.constant 1 : index + hal.return %c1, %c1_0, %c1_1 : index, index, index + } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32, #hal.descriptor_type> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32, #hal.descriptor_type> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = 
Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %c2 = arith.constant 2 : index + %4 = arith.muli %workgroup_id_x, %c2 overflow : index + %5 = vector.load %assume_align[%c0] : memref<2xf32, #hal.descriptor_type>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<2xf32> + return + } + } +} + +// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %c2 = arith.constant 2 : index + %4 = arith.muli %workgroup_id_x, %c2 overflow : index + %5 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, 
vector<2xf32> + return +} + +// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- // +module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %c2 = arith.constant 2 : index + %4 = arith.muli %workgroup_id_x, %c2 overflow : index + %5 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %6 = math.absf %5 : vector<2xf32> + vector.store %6, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return + } +} + +// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) 
alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 
: memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags 
= Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After CSE (cse) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = 
hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After IREEBufferizeConstantsPass (iree-codegen-iree-bufferize-constants) //----- // +module { + func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return + } +} + +// -----// IR Dump After FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 
= hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After ConvertComplexToStandardPass (convert-complex-to-standard) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After MathTransformPass 
(iree-codegen-math-transform) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + 
vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After VectorTransferLoweringPass (iree-codegen-vector-transfer-lowering) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After FoldMemRefAliasOpsPass (fold-memref-alias-ops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, 
strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After IREEExpandStridedMetadataPass (iree-codegen-expand-strided-metadata) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) 
alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversionPass (iree-llvmcpu-check-ir-before-llvm-conversion) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = 
memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After CSE (cse) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) 
flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After FoldMemRefAliasOpsPass (fold-memref-alias-ops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to 
index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After ArithExpandOpsPass (arith-expand) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, 
#hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After CSE (cse) //----- // +func.func @multiple_results_dispatch_0_elementwise_2_f32() { + %c0 = arith.constant 0 : 
index + %0 = hal.interface.constant.load layout(, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2xf32> + %assume_align = memref.assume_alignment %2, 64 : memref<2xf32> + %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%1) flags(Indirect) : memref<2xf32, strided<[1], offset: ?>> + %assume_align_0 = memref.assume_alignment %3, 64 : memref<2xf32, strided<[1], offset: ?>> + %4 = vector.load %assume_align[%c0] : memref<2xf32>, vector<2xf32> + %5 = math.absf %4 : vector<2xf32> + vector.store %5, %assume_align_0[%c0] : memref<2xf32, strided<[1], offset: ?>>, vector<2xf32> + return +} + +// -----// IR Dump After ConvertToLLVMPass (iree-convert-to-llvm) //----- // +module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : 
!llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1 + %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr + %16 = llvm.mul %8, %1 : i64 + %17 = llvm.udiv %16, %2 : i64 + %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1 + %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32> + %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } +} + +// -----// IR Dump After ReconcileUnrealizedCastsPass (reconcile-unrealized-casts) //----- // +module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> 
i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1 + %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr + %16 = llvm.mul %8, %1 : i64 + %17 = llvm.udiv %16, %2 : i64 + %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1 + %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32> + %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } +} + +// -----// IR Dump After 
LLVMCPUSynchronizeSymbolVisibilityPass (iree-llvmcpu-synchronize-symbol-visibility) //----- // +module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1 + %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %15 = llvm.load %14 
: !llvm.ptr -> !llvm.ptr + %16 = llvm.mul %8, %1 : i64 + %17 = llvm.udiv %16, %2 : i64 + %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1 + %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32> + %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + 
%11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%11, %3 : !llvm.ptr, i64)] : i1 + %12 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %14 = llvm.getelementptr %13[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr + %16 = llvm.mul %8, %1 : i64 + %17 = llvm.udiv %16, %2 : i64 + %18 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%18, %3 : !llvm.ptr, i64)] : i1 + %19 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %20 = llvm.intr.fabs(%19) : (vector<2xf32>) -> vector<2xf32> + %21 = llvm.getelementptr %15[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %20, %21 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } +} + +// -----// IR Dump After CSE (cse) //----- // +module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } +} + +// -----// IR Dump After AddFastMathFlagsPass (iree-codegen-add-fast-math-flags) //----- // +llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 +} + +// -----// IR Dump After TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = 
"arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %c1 = arith.constant 1 : index + %c1_0 = arith.constant 1 : index + %c1_1 = arith.constant 1 : index + hal.return %c1, %c1_0, %c1_1 : index, index, index + } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = 
llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } +} + +// -----// IR Dump After TranslateAllExecutablesPass (iree-hal-translate-all-executables) //----- // +hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) { + %c1 = arith.constant 1 : index + %c1_0 = arith.constant 1 : index + %c1_1 = arith.constant 1 : index + hal.return %c1, %c1_0, %c1_1 : index, index, index + } attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: 
!llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } +} + +// -----// IR Dump After 
ConvertToHALPass (iree-hal-conversion) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + 
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + %__device_0_1 = util.global.load immutable @__device_0 : !hal.device + %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %__device_0_3 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %__device_0_4 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64_5 = arith.constant -1 : i64 + %c0_6 = arith.constant 0 : index + %c1 = arith.constant 1 : 
index + %c2_7 = arith.constant 2 : index + %1 = hal.device.memoize<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) -> !hal.command_buffer { + %c3 = arith.constant 3 : index + %cmd = hal.command_buffer.create device(%__device_0_4 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64_5) bindings(%c3) : !hal.command_buffer + %2 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device + %exe = hal.executable.lookup device(%2 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + %c1_20 = arith.constant 1 : index + %c1_21 = arith.constant 1 : index + %c1_22 = arith.constant 1 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1_20, %c1_21, %c1_22]) constants([%c0_i32]) bindings([ + (%c0_6 : index)[%c0, %c8], + (%c2_7 : index)[%c0, %c128] + ]) flags("None") + %3 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device + %exe_23 = hal.executable.lookup device(%3 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal_24 = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + %c1_25 = arith.constant 1 : index + %c1_26 = arith.constant 1 : index + %c1_27 = arith.constant 1 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe_23 : !hal.executable)[%ordinal_24] workgroups([%c1_25, %c1_26, %c1_27]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2_7 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + hal.return 
%cmd : !hal.command_buffer + } + %fence_8 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_8) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0_6, %c8], + (%buffer_0 : !hal.buffer)[%c0_6, %c8], + (%transient_buffer : !hal.buffer)[%c0_6, %c128] + ]) flags("None") + %c-1_i32 = arith.constant -1 : i32 + %status = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %buffer_9 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c0, %c8] : !hal.buffer + %buffer_10 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c64, %c8] : !hal.buffer + %dense_row_major_11 = hal.encoding_type : i32 + %element_type_f32_12 = hal.element_type : i32 + %c2_13 = arith.constant 2 : index + %c0_14 = arith.constant 0 : index + %view = hal.buffer_view.create buffer(%buffer_9 : !hal.buffer)[%c0_14, %c8] shape([%c2_13]) type(%element_type_f32_12) encoding(%dense_row_major_11) : !hal.buffer_view + %dense_row_major_15 = hal.encoding_type : i32 + %element_type_f32_16 = hal.element_type : i32 + %c2_17 = arith.constant 2 : index + %c0_18 = arith.constant 0 : index + %view_19 = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_18, %c8] shape([%c2_17]) type(%element_type_f32_16) encoding(%dense_row_major_15) : !hal.buffer_view + util.return %view, %view_19 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After OutlineMemoizeRegionsPass (iree-hal-outline-memoize-regions) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, 
native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 
["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c0_0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c3 = arith.constant 3 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device + %exe = hal.executable.lookup device(%0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + %c1_1 = arith.constant 1 : index + %c1_2 = arith.constant 1 : index + 
%c1_3 = arith.constant 1 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1_1, %c1_2, %c1_3]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0_0, %c8], + (%c2 : index)[%c0_0, %c128] + ]) flags("None") + %1 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device + %exe_4 = hal.executable.lookup device(%1 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal_5 = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + %c1_6 = arith.constant 1 : index + %c1_7 = arith.constant 1 : index + %c1_8 = arith.constant 1 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe_4 : !hal.executable)[%ordinal_5] workgroups([%c1_6, %c1_7, %c1_8]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0_0, %c8], + (%c2 : index)[%c0_0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %__device_0 = util.global.load @__device_0 : !hal.device + %0 = util.cmp.eq %arg0, %__device_0 : !hal.device + %1 = scf.if %0 -> (!hal.command_buffer) { + 
%__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + %2 = util.null : !hal.command_buffer + scf.yield %2 : !hal.command_buffer + } + util.return %1 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + %__device_0_1 = util.global.load immutable @__device_0 : !hal.device + %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator + 
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %__device_0_3 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %__device_0_4 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64_5 = arith.constant -1 : i64 + %c0_6 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2_7 = arith.constant 2 : index + %1 = util.call @__multiple_results_memoize_lookup(%__device_0_4, %c-1_i64_5) : (!hal.device, i64) -> !hal.command_buffer + %fence_8 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0_4 : !hal.device> affinity(%c-1_i64_5) wait(%fence) signal(%fence_8) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0_6, %c8], + (%buffer_0 : !hal.buffer)[%c0_6, %c8], + (%transient_buffer : !hal.buffer)[%c0_6, %c128] + ]) flags("None") + %c-1_i32 = arith.constant -1 : i32 + %status = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %buffer_9 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c0, %c8] : !hal.buffer + %buffer_10 = hal.buffer.subspan<%transient_buffer : !hal.buffer>[%c64, %c8] : !hal.buffer + %dense_row_major_11 = hal.encoding_type : i32 
+ %element_type_f32_12 = hal.element_type : i32 + %c2_13 = arith.constant 2 : index + %c0_14 = arith.constant 0 : index + %view = hal.buffer_view.create buffer(%buffer_9 : !hal.buffer)[%c0_14, %c8] shape([%c2_13]) type(%element_type_f32_12) encoding(%dense_row_major_11) : !hal.buffer_view + %dense_row_major_15 = hal.encoding_type : i32 + %element_type_f32_16 = hal.element_type : i32 + %c2_17 = arith.constant 2 : index + %c0_18 = arith.constant 0 : index + %view_19 = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_18, %c8] shape([%c2_17]) type(%element_type_f32_16) encoding(%dense_row_major_15) : !hal.buffer_view + util.return %view, %view_19 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes 
{inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + %exe_0 = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal_1 = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe_0 : !hal.executable)[%ordinal_1] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, 
iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + %__device_0_1 = util.global.load immutable @__device_0 : !hal.device + %allocator_2 = hal.device.allocator<%__device_0_1 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %__device_0_3 = util.global.load immutable @__device_0 : 
!hal.device + %fence = hal.fence.create device(%__device_0_3 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0_3 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %__device_0_4 = util.global.load immutable @__device_0 : !hal.device + %1 = util.call @__multiple_results_memoize_lookup(%__device_0_4, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_5 = hal.fence.create device(%__device_0_4 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0_4 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_5) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_5]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %dense_row_major_6 = hal.encoding_type : i32 + %element_type_f32_7 = hal.element_type : i32 + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32_7) encoding(%dense_row_major_6) : !hal.buffer_view + %dense_row_major_8 = hal.encoding_type : i32 + %element_type_f32_9 = hal.element_type : i32 + %view_10 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32_9) encoding(%dense_row_major_8) : !hal.buffer_view + util.return %view, %view_10 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + 
%__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %__device_0 = util.global.load @__device_0 : !hal.device + %0 = util.null : !hal.command_buffer + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = 
arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> 
affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : 
!hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %element_type_f32 = hal.element_type : i32 + 
%dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await 
until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : 
!hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = 
hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) 
timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> 
source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = 
llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = 
hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable 
@__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, 
max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr 
-> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, 
%c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index 
+ %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = 
hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size 
= [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : 
!llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = 
arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = 
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : 
!hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After LLVMCPULinkExecutablesPass (iree-llvmcpu-link-executables) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = 
llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = 
arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + 
%__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) 
type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } +} + +// -----// IR Dump After LLVMCPUAssignConstantOrdinalsPass (iree-llvmcpu-assign-constant-ordinals) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) 
: i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } +} + +// -----// IR Dump After LLVMCPUAssignImportOrdinalsPass (iree-llvmcpu-assign-import-ordinals) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, 
ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } +} + +// -----// IR Dump After LinkTargetExecutablesPass (iree-hal-link-target-executables) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", 
llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 
{alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, 
i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) 
type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + 
util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, 
i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = 
hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + 
scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) 
lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After LinkAllExecutablesPass (iree-hal-link-all-executables) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> 
+#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, 
i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %ordinal = hal.executable.export.ordinal target(@multiple_results_dispatch_0::@embedded_elf_arm_64::@multiple_results_dispatch_0_elementwise_2_f32) : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) 
bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : 
!hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) 
commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- // +hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect>) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> 
i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } +} + +// -----// IR Dump After ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue 
%5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %exe = hal.executable.lookup device(%arg0 : !hal.device) executable(@multiple_results_dispatch_0) : !hal.executable + %c0_0 = arith.constant 0 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) 
bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + 
%c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> 
!hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %ok, %value = 
hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %0 = arith.select %value, %c0, %c-1 : index + %1 = scf.index_switch %0 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + %c14_i32 = arith.constant 14 : i32 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + %2 = util.null : !hal.executable + scf.yield %2 : !hal.executable + } + util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : 
!llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") 
categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c0_0 = arith.constant 0 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + 
scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_types, %buffer_usage = 
hal.allocator.resolve_memory_properties for(#hal.device.affinity<@__device_0>) lifetime(external) : i32, i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_types) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ResolveTopologyQueriesPass (iree-hal-resolve-topology-queries) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> 
+#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %0 = arith.select %value, %c0, %c-1 : index + %1 = scf.index_switch %0 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + %c14_i32 = arith.constant 14 : i32 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + %2 = util.null : !hal.executable + scf.yield %2 : !hal.executable + } + util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: 
!llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private 
@__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c0_0 = arith.constant 0 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : 
!hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MemoizeDeviceSelectionPass (iree-hal-memoize-device-selection) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %0 = arith.select %value, %c0, %c-1 : index + %1 = scf.index_switch %0 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + %c14_i32 = arith.constant 14 : i32 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + %2 = util.null : !hal.executable + scf.yield %2 : !hal.executable + } + util.global.store %1, 
@__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c0_0 = arith.constant 0 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) 
flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : 
i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) 
bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + 
util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %0 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %1 = scf.index_switch %0 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + %c14_i32 = arith.constant 14 : i32 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + %2 = util.null : !hal.executable + scf.yield %2 : !hal.executable + } + util.global.store %1, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = 
"arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} 
: vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c0_0 = arith.constant 0 : index + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0_0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer 
+ util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = 
hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) 
flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load @__device_0 : !hal.device + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = scf.index_switch %1 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %2, 
@__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") 
target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes 
{inlining_policy = #util.inline.never} { + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public 
@multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = 
hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load @__device_0 : !hal.device + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = scf.index_switch %1 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 
: !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %2, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") 
target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = 
arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : 
!hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = scf.index_switch %1 -> !hal.executable + case 0 { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } + default { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %2, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = 
hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.global.store %ok, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64_ok : i1 + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> 
message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load 
immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : 
!hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.null : !hal.command_buffer + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", 
cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable 
`multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] 
workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = 
arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + 
hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64]> : !hal.device +module { + util.global private @__device_0 = #device_target_local + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, 
@__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: 
!llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes 
{inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> 
!hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) 
//----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : 
!hal.command_buffer + util.return +} + +// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) 
minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = 
arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ElideRedundantCommandsPass (iree-hal-elide-redundant-commands) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After InitializeDevicesPass (iree-hal-initialize-devices) //----- // 
+#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { + %4 = util.cmp.eq %arg2, %0 : !hal.device + %5 = arith.cmpi slt, %arg0, %device_count : index + %6 = arith.andi %4, %5 : i1 + scf.condition(%6) %arg0, %arg1, %arg2 : index, index, !hal.device + } do { + ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): + %device_n = hal.devices.get %arg0 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + %4 = scf.if %value -> (i1) { + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + scf.yield %value_1 : i1 + } else { + %false = arith.constant false + scf.yield %false : i1 + } + %5 = arith.cmpi eq, %arg1, %c0 : index + %6 = arith.select %4, %c1, %c0 : index + %7 = arith.addi %arg1, %6 : index + %8 = arith.andi %4, %5 : i1 + %9 = arith.select %8, %device_n, %0 : !hal.device + %10 = arith.addi %arg0, %c1 : index + scf.yield %10, %7, %9 : index, index, !hal.device + } + %2 = util.null : !hal.device + %3 = util.cmp.eq %1#2, %2 : !hal.device + scf.if %3 { + %c18_i32 = arith.constant 18 : i32 + util.status.check_ok %c18_i32, "HAL device 
`__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + } + util.global.store %1#2, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable 
`multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> 
!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] 
workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = 
arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + 
hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { + %3 = util.cmp.eq %arg2, %0 : !hal.device + %4 = arith.cmpi slt, %arg0, %device_count : index + %5 = arith.andi %3, %4 
: i1 + scf.condition(%5) %arg0, %arg1, %arg2 : index, index, !hal.device + } do { + ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): + %device_n = hal.devices.get %arg0 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + %3 = scf.if %value -> (i1) { + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + scf.yield %value_1 : i1 + } else { + scf.yield %false : i1 + } + %4 = arith.cmpi eq, %arg1, %c0 : index + %5 = arith.select %3, %c1, %c0 : index + %6 = arith.addi %arg1, %5 : index + %7 = arith.andi %3, %4 : i1 + %8 = arith.select %7, %device_n, %0 : !hal.device + %9 = arith.addi %arg0, %c1 : index + scf.yield %9, %6, %8 : index, index, !hal.device + } + %2 = util.cmp.eq %1#2, %0 : !hal.device + scf.if %2 { + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + } + util.global.store %1#2, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + 
util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = 
llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : 
index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = 
scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) 
type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- // +#executable_target_embedded_elf_arm_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding], flags = Indirect> +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + %1:3 = scf.while (%arg0 = %c0, %arg1 = %c0, %arg2 = %0) : (index, index, !hal.device) -> (index, index, !hal.device) { + %3 = util.cmp.eq %arg2, %0 : !hal.device + %4 = arith.cmpi slt, %arg0, %device_count : index + %5 = arith.andi %3, %4 : i1 + scf.condition(%5) %arg0, %arg1, %arg2 : index, index, !hal.device + } do { + ^bb0(%arg0: index, %arg1: index, %arg2: !hal.device): + %device_n = hal.devices.get %arg0 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + %3 = scf.if %value -> (i1) { + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + scf.yield %value_1 : i1 + } else { + scf.yield %false : i1 + } + %4 = arith.cmpi eq, %arg1, %c0 : index + %5 = arith.select %3, %c1, %c0 : index + %6 = arith.addi %arg1, %5 : index + %7 = arith.andi %3, %4 : i1 + %8 = arith.select %7, %device_n, %0 : !hal.device + %9 = arith.addi %arg0, %c1 : index + scf.yield %9, %6, %8 : index, index, !hal.device + } + %2 = util.cmp.eq %1#2, %0 : !hal.device + scf.if %2 { + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, 
data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + } + util.global.store %1#2, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + %3 = scf.if %2 -> (!hal.executable) { + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + scf.yield %executable : !hal.executable + } else { + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + scf.yield %0 : !hal.executable + } + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + 
util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64) { + hal.executable.export public @multiple_results_dispatch_0_elementwise_2_f32 ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [1 : index, 1 : index, 1 : index]} + builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", llvm.target_triple = "arm64-unknown-unknown-eabi-elf"} { + llvm.func @multiple_results_dispatch_0_elementwise_2_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(8 : i64) : i64 + %2 = llvm.mlir.constant(32 : i64) : i64 + %3 = llvm.mlir.constant(64 : index) : i64 + %4 = llvm.mlir.constant(true) : i1 + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %6 = llvm.extractvalue %5[9] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %7 = llvm.load %6 : !llvm.ptr -> i32 + %8 = llvm.zext %7 : i32 to i64 + %9 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.intr.assume %4 ["align"(%10, %3 : !llvm.ptr, i64)] : i1 + %11 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)> + %12 = llvm.extractvalue %11[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, 
ptr, ptr, ptr)> + %13 = llvm.getelementptr %12[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr + %14 = llvm.load %13 : !llvm.ptr -> !llvm.ptr + %15 = llvm.mul %8, %1 : i64 + %16 = llvm.udiv %15, %2 : i64 + %17 = llvm.getelementptr %14[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.intr.assume %4 ["align"(%17, %3 : !llvm.ptr, i64)] : i1 + %18 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> vector<2xf32> + %19 = llvm.intr.fabs(%18) : (vector<2xf32>) -> vector<2xf32> + llvm.store %19, %17 {alignment = 4 : i64} : vector<2xf32>, !llvm.ptr + llvm.return %0 : i32 + } + } + } + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") 
target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = scf.if %1 -> (!hal.command_buffer) { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + scf.yield %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } else { + scf.yield %0 : !hal.command_buffer + } + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = 
hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + 
(%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb6 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2(%1, %2, %3 : index, index, !hal.device), ^bb7 +^bb2(%7: index, %8: index, %9: !hal.device): // pred: ^bb1 + %device_n = hal.devices.get %7 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4 +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> 
key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb5(%value_1 : i1) +^bb4: // pred: ^bb2 + cf.br ^bb5(%false : i1) +^bb5(%10: i1): // 2 preds: ^bb3, ^bb4 + cf.br ^bb6 +^bb6: // pred: ^bb5 + %11 = arith.cmpi eq, %8, %c0 : index + %12 = arith.select %10, %c1, %c0 : index + %13 = arith.addi %8, %12 : index + %14 = arith.andi %10, %11 : i1 + %15 = arith.select %14, %device_n, %0 : !hal.device + %16 = arith.addi %7, %c1 : index + cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) +^bb7: // pred: ^bb1 + %17 = util.cmp.eq %3, %0 : !hal.device + cf.cond_br %17, ^bb8, ^bb9 +^bb8: // pred: ^bb7 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb9 +^bb9: // 2 preds: ^bb7, ^bb8 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create 
device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + cf.br ^bb4 +^bb4: // pred: ^bb3 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + cf.cond_br %1, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + cf.br ^bb3(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer) +^bb2: // pred: ^bb0 + cf.br ^bb3(%0 : !hal.command_buffer) +^bb3(%2: !hal.command_buffer): // 2 preds: ^bb1, ^bb2 + cf.br ^bb4 +^bb4: // pred: ^bb3 + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : 
index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : 
!hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : 
!hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- // +hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} +} + +// -----// IR Dump After SerializeAllExecutablesPass (iree-hal-serialize-all-executables) //----- // +hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} +} + +// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb6 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : 
index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2(%1, %2, %3 : index, index, !hal.device), ^bb7 + ^bb2(%7: index, %8: index, %9: !hal.device): // pred: ^bb1 + %device_n = hal.devices.get %7 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb5(%value_1 : i1) + ^bb4: // pred: ^bb2 + cf.br ^bb5(%false : i1) + ^bb5(%10: i1): // 2 preds: ^bb3, ^bb4 + cf.br ^bb6 + ^bb6: // pred: ^bb5 + %11 = arith.cmpi eq, %8, %c0 : index + %12 = arith.select %10, %c1, %c0 : index + %13 = arith.addi %8, %12 : index + %14 = arith.andi %10, %11 : i1 + %15 = arith.select %14, %device_n, %0 : !hal.device + %16 = arith.addi %7, %c1 : index + cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) + ^bb7: // pred: ^bb1 + %17 = util.cmp.eq %3, %0 : !hal.device + cf.cond_br %17, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb9 + ^bb9: // 2 preds: ^bb7, ^bb8 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : 
i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + cf.br ^bb4 + ^bb4: // pred: ^bb3 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + cf.cond_br %1, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + cf.br ^bb3(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer) + ^bb2: 
// pred: ^bb0 + cf.br ^bb3(%0 : !hal.command_buffer) + ^bb3(%2: !hal.command_buffer): // 2 preds: ^bb1, ^bb2 + cf.br ^bb4 + ^bb4: // pred: ^bb3 + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> 
message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: 
!hal.device): // 2 preds: ^bb0, ^bb6 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2(%1, %2, %3 : index, index, !hal.device), ^bb7 + ^bb2(%7: index, %8: index, %9: !hal.device): // pred: ^bb1 + %device_n = hal.devices.get %7 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb5(%value_1 : i1) + ^bb4: // pred: ^bb2 + cf.br ^bb5(%false : i1) + ^bb5(%10: i1): // 2 preds: ^bb3, ^bb4 + cf.br ^bb6 + ^bb6: // pred: ^bb5 + %11 = arith.cmpi eq, %8, %c0 : index + %12 = arith.select %10, %c1, %c0 : index + %13 = arith.addi %8, %12 : index + %14 = arith.andi %10, %11 : i1 + %15 = arith.select %14, %device_n, %0 : !hal.device + %16 = arith.addi %7, %c1 : index + cf.br ^bb1(%16, %13, %15 : index, index, !hal.device) + ^bb7: // pred: ^bb1 + %17 = util.cmp.eq %3, %0 : !hal.device + cf.cond_br %17, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb9 + ^bb9: // 2 preds: ^bb7, ^bb8 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : 
!hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + cf.br ^bb4 + ^bb4: // pred: ^bb3 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + cf.cond_br %1, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + cf.br ^bb3(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer) + ^bb2: 
// pred: ^bb0 + cf.br ^bb3(%0 : !hal.command_buffer) + ^bb3(%2: !hal.command_buffer): // 2 preds: ^bb1, ^bb2 + cf.br ^bb4 + ^bb4: // pred: ^bb3 + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> 
message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) 
//----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + cf.cond_br %1, ^bb1, ^bb2(%0 : !hal.command_buffer) +^bb1: // pred: ^bb0 + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + cf.br ^bb2(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer) +^bb2(%2: !hal.command_buffer): // 2 preds: ^bb0, ^bb1 + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // 
+util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + cf.cond_br %1, ^bb1, ^bb2(%0 : !hal.command_buffer) +^bb1: // pred: ^bb0 + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + cf.br ^bb2(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer) +^bb2(%2: !hal.command_buffer): // 2 preds: ^bb0, ^bb1 + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func 
private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq 
%3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2(%1, %2 : index, index), ^bb5 +^bb2(%7: index, %8: index): // pred: ^bb1 + %device_n = hal.devices.get %7 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%9: i1): // 2 preds: ^bb2, ^bb3 + %10 = arith.cmpi eq, %8, %c0 : index + %11 = arith.select %9, %c1, %c0 : index + %12 = arith.addi %8, %11 : index + %13 = arith.andi %9, %10 : i1 + %14 = arith.select %13, %device_n, %0 : !hal.device + %15 = arith.addi %7, %c1 : index + cf.br ^bb1(%15, %12, %14 : index, index, !hal.device) +^bb5: // pred: ^bb1 + %16 = util.cmp.eq %3, %0 : !hal.device + cf.cond_br %16, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + 
%c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 
+ %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : 
!hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) 
encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = 
hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c-1_i64 = 
arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) 
target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : 
!hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) 
//----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2(%1, %2 : index, index), ^bb5 +^bb2(%7: index, %8: index): // pred: ^bb1 + %device_n = hal.devices.get %7 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%9: i1): // 2 preds: ^bb2, ^bb3 + %10 = arith.cmpi eq, %8, %c0 : index + %11 = arith.select %9, %c1, %c0 : index + %12 = arith.addi %8, %11 : index + %13 = arith.andi %9, %10 : i1 + %14 = arith.select %13, %device_n, %0 : !hal.device + %15 = arith.addi %7, %c1 : index + cf.br ^bb1(%15, %12, %14 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + 
util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + 
hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") 
allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq 
%3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2(%1, %2 : index, index), ^bb5 +^bb2(%7: index, %8: index): // pred: ^bb1 + %device_n = hal.devices.get %7 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%9: i1): // 2 preds: ^bb2, ^bb3 + %10 = arith.cmpi eq, %8, %c0 : index + %11 = arith.select %9, %c1, %c0 : index + %12 = arith.addi %8, %11 : index + %13 = arith.andi %9, %10 : i1 + %14 = arith.select %13, %device_n, %0 : !hal.device + %15 = arith.addi %7, %c1 : index + cf.br ^bb1(%15, %12, %14 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, 
@__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %0 = util.null : !hal.command_buffer + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + cf.cond_br %1, ^bb1, ^bb2(%0 : !hal.command_buffer) +^bb1: // pred: ^bb0 + cf.br ^bb2(%__multiple_results_memoize_result_0_device_0 : !hal.command_buffer) +^bb2(%2: !hal.command_buffer): // 2 preds: ^bb0, ^bb1 + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + 
%11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module attributes {iree.fixedpoint.iteration = 0 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, 
!hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + 
util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer + } + 
util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + 
%fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module attributes {iree.fixedpoint.iteration = 0 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, 
%5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + 
util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable + %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%arg1) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup(%arg0: !hal.device, %arg1: i64) -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %1 = util.cmp.eq %arg0, %__device_0 : !hal.device + %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer + } + 
util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + 
%fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup(%__device_0, %c-1_i64) : (!hal.device, i64) -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +module attributes {iree.fixedpoint.iteration = 0 : index, iree.fixedpoint.modified} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = 
arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %0 = util.null : !hal.command_buffer + %__device_0_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %1 = util.cmp.eq %__device_0, %__device_0_0 : !hal.device + %2 = arith.select %1, 
%__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) 
type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : 
!hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + 
util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) 
constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, 
@__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %1 = util.cmp.eq %__device_0, %__device_0_0 : !hal.device + %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = 
hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] 
shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = 
\22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) 
-> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") 
: !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %0 = util.null : !hal.command_buffer + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %1 = util.cmp.eq %__device_0, %__device_0 : !hal.device + %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + 
%c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: 
// pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : 
!hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : 
!hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 
2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %0 = util.null : !hal.command_buffer + %1 = util.cmp.eq %__device_0, %__device_0 : !hal.device + %2 = arith.select %1, %__multiple_results_memoize_result_0_device_0, %0 : !hal.command_buffer + util.return %2 : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = 
hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : 
!hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump 
After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable 
@__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, 
\22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + 
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module attributes {iree.fixedpoint.iteration = 1 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, 
data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: 
^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C746970
6C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D65
6E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E000000000000000000000000000
00001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + 
%c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, 
%arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) 
flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call @__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module attributes {iree.fixedpoint.iteration = 1 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : 
!hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 
= arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup() -> !hal.command_buffer { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return %__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: 
tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %1 = util.call 
@__multiple_results_memoize_lookup() : () -> !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%1) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +module attributes {iree.fixedpoint.iteration = 1 : index, iree.fixedpoint.modified} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : 
!hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load 
@__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup() { + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + 
%0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %__multiple_results_memoize_result_0_device_0 = 
util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 
: !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 
+ %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = 
hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> 
affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_lookup() { + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_lookup() { + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_lookup() { + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) 
target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, 
[#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) 
type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return 
%view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // 
+util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") 
+ hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + 
%c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_lookup() { + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = 
util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : 
index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) 
+^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = 
util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer 
: !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ 
+ (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = 
\22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module attributes {iree.fixedpoint.iteration = 2 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select 
%7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : 
index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D804020000000000
03040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A000000000000002300000000000000
0400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000
20010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA00000003000000
00000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call 
@__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup() { + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> 
message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module attributes {iree.fixedpoint.iteration = 2 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = 
arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + 
%__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func private @__multiple_results_memoize_lookup() { + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = 
arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + util.call @__multiple_results_memoize_lookup() : () -> () + %fence_1 = hal.fence.create device(%__device_0 
: !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +module attributes {iree.fixedpoint.iteration = 2 : index, iree.fixedpoint.modified} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = 
false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = 
arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C
5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C0000000000000006
00000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023
00000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C00000000300000000
0000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global 
private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = 
false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + 
%c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br 
^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = 
util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = 
util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = 
arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : 
!hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + 
cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass 
(iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, 
%input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + 
%transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) 
+^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = 
hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : 
index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) 
timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: 
#hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c0 = arith.constant 0 : index + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: 
!hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + 
cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module attributes {iree.fixedpoint.iteration = 3 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = 
arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) 
target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE030000000000000805020000000000030
4000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190
000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000
000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = 
"embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer 
+ util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator 
: !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module attributes {iree.fixedpoint.iteration = 3 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq 
%3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + 
util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = 
arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) 
wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After IPOPass (iree-util-ipo) //----- // +module attributes {iree.fixedpoint.iteration = 3 : index} { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : 
index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br 
%2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D8040200000000000304000000000000700
4010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80
100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010
0000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA000000030000000000000000000000000
0000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call 
@__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + 
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, 
%0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false 
+ util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = 
arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) 
wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) +^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : 
!hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) 
target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) +^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) +^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + 
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = 
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : 
!hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Inliner (inline) //----- // +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + 
cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = 
arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) 
wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = 
arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + 
%executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000
000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000
000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010000000000000008000000000000001800000000
0000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000
000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + 
util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : 
!hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After VerifyInitializationOrderPass (iree-util-verify-initialization-order) //----- // +module { + util.global private @__device_0 : !hal.device + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, 
%2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + util.return + } + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.initializer { + %__device_0 = util.global.load @__device_0 : !hal.device + %ok, %value = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value, 
@__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.return + } + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.initializer { + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %0 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0 = util.global.load @__device_0 : !hal.device + %1 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %2 = arith.cmpi eq, %1, %c0 : index + cf.cond_br %2, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + %executable = hal.executable.create device(%__device_0 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb3(%executable : !hal.executable) + ^bb2: // pred: ^bb0 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb3(%0 : !hal.executable) + ^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2 + util.global.store %3, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %0, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = 
arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) 
wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- // +module { + util.global private @__device_0 : !hal.device + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %0 : index, index, !hal.device) + ^bb1(%1: index, %2: index, %3: !hal.device): // 2 preds: ^bb0, ^bb4 + %4 = util.cmp.eq %3, %0 : !hal.device + %5 = arith.cmpi slt, %1, %device_count : index + %6 = arith.andi %4, %5 : i1 + cf.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %1 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = 
hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%7: i1): // 2 preds: ^bb2, ^bb3 + %8 = arith.cmpi eq, %2, %c0 : index + %9 = arith.select %7, %c1, %c0 : index + %10 = arith.addi %2, %9 : index + %11 = arith.andi %7, %8 : i1 + %12 = arith.select %11, %device_n, %0 : !hal.device + %13 = arith.addi %1, %c1 : index + cf.br ^bb1(%13, %10, %12 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %4, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %3, @__device_0 : !hal.device + cf.br ^bb8 + ^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %c-1_i64 = arith.constant -1 : i64 + %c-1 = arith.constant -1 : index + %c0_4 = arith.constant 0 : index + %c14_i32 = arith.constant 14 : i32 + %14 = util.null : !hal.executable + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_5 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0_4, %c-1 : index + %16 = 
arith.cmpi eq, %15, %c0_4 : index + cf.cond_br %16, ^bb9, ^bb10 + ^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_5 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) + ^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%14 : !hal.executable) + ^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 + ^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + 
%__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) 
timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) 
flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd 
: !hal.command_buffer +} + +// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size 
= 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 +^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 +^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) +^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) +^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 +^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SCFForLoopCanonicalization (scf-for-loop-canonicalization) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes 
{iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + 
%__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = 
hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : 
!hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> 
target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, 
^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 +^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 +^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) +^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) +^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 
+^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = 
arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: 
index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 +^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + 
%__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 +^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) +^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) +^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 +^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = 
hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) 
commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After IREECodegenAffineExpandIndexOpsPass (iree-codegen-affine-expand-index-ops) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : 
index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 +^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 +^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) +^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support 
any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) +^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 +^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = 
hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : 
!hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After IREECodegenLowerAffinePass (iree-codegen-lower-affine) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, 
max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 +^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 +^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) +^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) +^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 +^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After ArithUnsignedWhenEquivalentPass (arith-unsigned-when-equivalent) //----- // +module { + util.global private @__device_0 : !hal.device + util.global private 
@__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) + ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = 
\22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 + ^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 + ^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : !hal.executable) + ^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) + ^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 + ^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + hal.executable private @multiple_results_dispatch_0 { + 
hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D
756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105
010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F0000000800000003
0000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + 
%__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + 
%__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") 
+ %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- // +module { + util.global private @__device_0 : !hal.device + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) + ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, 
i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + cf.br ^bb8 + ^bb8: // pred: ^bb7 + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb9, ^bb10 + ^bb9: // pred: ^bb8 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb11(%executable : 
!hal.executable) + ^bb10: // pred: ^bb8 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb11(%0 : !hal.executable) + ^bb11(%17: !hal.executable): // 2 preds: ^bb9, ^bb10 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + cf.br ^bb12 + ^bb12: // pred: ^bb11 + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000
E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F801000000000000
00000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000
080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000
DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = 
{iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create 
device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = 
hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) 
commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : 
!hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 
+^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb8, ^bb9 +^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) +^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) +^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> 
!hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After CSE (cse) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After CSE (cse) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, 
!hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) 
usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After CSE (cse) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : 
index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + 
%15 = arith.select %__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb8, ^bb9 +^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) +^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) +^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module { + util.global private @__device_0 : !hal.device + util.global private @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) + ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + 
cf.cond_br %7, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + util.global.store %4, @__device_0 : !hal.device + %__device_0 = util.global.load @__device_0 : !hal.device + %ok_2, %value_3 = hal.device.query<%__device_0 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_query_0_hal_executable_format_embedded_elf_arm_64 = util.global.load @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + %__device_0_4 = util.global.load @__device_0 : !hal.device + %15 = arith.select 
%__device_0_query_0_hal_executable_format_embedded_elf_arm_64, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + cf.cond_br %16, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%__device_0_4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) + ^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) + ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + 
%__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) 
timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %c-1_i64 = arith.constant -1 : i64 + %c3 = arith.constant 3 : index + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c128 = arith.constant 128 : index + %c64_i32 = arith.constant 64 : i32 + %c1 = arith.constant 1 : index + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) 
flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, 
\22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %15 = arith.select %value_3, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + util.global.store %4, @__device_0 : !hal.device + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + cf.cond_br %16, ^bb8, ^bb9 +^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) +^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) +^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: 
tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %c-1_i32 = arith.constant -1 : i32 + %c0_i64 = arith.constant 0 : i64 + %0 = util.null : !hal.fence + %c-1_i64 = arith.constant -1 : i64 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = 
hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") 
categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) +^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) +^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, 
i1 = false + cf.br ^bb4(%value_1 : i1) +^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) +^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 +^bb7: // 2 preds: ^bb5, ^bb6 + %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %15 = arith.select %value_3, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + util.global.store %4, @__device_0 : !hal.device + util.global.store %value_3, @__device_0_query_0_hal_executable_format_embedded_elf_arm_64 : i1 + cf.cond_br %16, ^bb8, ^bb9 +^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) +^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) +^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : 
!hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return +} + +// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // +util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + 
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view +} + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module { + util.global private @__device_0 : !hal.device + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.global private 
@__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) + ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, %c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, 
target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %15 = arith.select %value_3, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + util.global.store %4, @__device_0 : !hal.device + cf.cond_br %16, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) + ^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) + ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + hal.executable private @multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load 
immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + 
%__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, %c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) 
timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module { + util.global private @__device_0 : !hal.device + util.global private @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + util.global private @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.initializer { + %0 = util.null : !hal.executable + %c14_i32 = arith.constant 14 : i32 + %c-1 = arith.constant -1 : index + %c-1_i64 = arith.constant -1 : i64 + %c18_i32 = arith.constant 18 : i32 + %false = arith.constant false + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = util.null : !hal.device + %device_count = hal.devices.count : index + cf.br ^bb1(%c0, %c0, %1 : index, index, !hal.device) + ^bb1(%2: index, %3: index, %4: !hal.device): // 2 preds: ^bb0, ^bb4 + %5 = util.cmp.eq %4, %1 : !hal.device + %6 = arith.cmpi slt, %2, %device_count : index + %7 = arith.andi %5, %6 : i1 + cf.cond_br %7, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %device_n = hal.devices.get %2 : !hal.device + %ok, %value = hal.device.query<%device_n : !hal.device> key("hal.device.id" :: "local*") : i1, i1 = false + cf.cond_br %value, ^bb3, ^bb4(%false : i1) + ^bb3: // pred: ^bb2 + %ok_0, %value_1 = hal.device.query<%device_n : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + cf.br ^bb4(%value_1 : i1) + ^bb4(%8: i1): // 2 preds: ^bb2, ^bb3 + %9 = arith.cmpi eq, %3, %c0 : index + %10 = arith.select %8, %c1, 
%c0 : index + %11 = arith.addi %3, %10 : index + %12 = arith.andi %8, %9 : i1 + %13 = arith.select %12, %device_n, %1 : !hal.device + %14 = arith.addi %2, %c1 : index + cf.br ^bb1(%14, %11, %13 : index, index, !hal.device) + ^bb5: // pred: ^bb1 + cf.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + util.status.check_ok %c18_i32, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + cf.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %ok_2, %value_3 = hal.device.query<%4 : !hal.device> key("hal.executable.format" :: "embedded-elf-arm_64") : i1, i1 = false + %15 = arith.select %value_3, %c0, %c-1 : index + %16 = arith.cmpi eq, %15, %c0 : index + util.global.store %4, @__device_0 : !hal.device + cf.cond_br %16, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %executable = hal.executable.create device(%4 : !hal.device) affinity(%c-1_i64) target(@multiple_results_dispatch_0::@embedded_elf_arm_64) : !hal.executable + cf.br ^bb10(%executable : !hal.executable) + ^bb9: // pred: ^bb7 + util.status.check_ok %c14_i32, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + cf.br ^bb10(%0 : !hal.executable) + ^bb10(%17: !hal.executable): // 2 preds: ^bb8, ^bb9 + util.global.store %17, @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %18 = util.call @__multiple_results_memoize_apply() : () -> !hal.command_buffer + util.global.store %18, @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + util.return + } + hal.executable private 
@multiple_results_dispatch_0 { + hal.executable.binary public @embedded_elf_arm_64 attributes {data = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000000000000000000000000000
00000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000
000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000000000008000000000000001
0000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8>, format = "embedded-elf-arm_64", mime_type = "application/x-elf"} + } + util.func private @__multiple_results_memoize_apply() -> !hal.command_buffer attributes {inlining_policy = #util.inline.never} { + %c1 = arith.constant 1 : index + %c64_i32 = arith.constant 64 : i32 + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c3 = arith.constant 3 : index + %c-1_i64 = arith.constant -1 : i64 + %__device_0 = util.global.load immutable 
@__device_0 : !hal.device + %__device_0_executable_0_multiple_results_dispatch_0 = util.global.load immutable @__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable + %cmd = hal.command_buffer.create device(%__device_0 : !hal.device) mode("None") categories("Transfer|Dispatch") affinity(%c-1_i64) bindings(%c3) : !hal.command_buffer + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c0_i32]) bindings([ + (%c0 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%__device_0_executable_0_multiple_results_dispatch_0 : !hal.executable)[%c0] workgroups([%c1, %c1, %c1]) constants([%c64_i32]) bindings([ + (%c1 : index)[%c0, %c8], + (%c2 : index)[%c0, %c128] + ]) flags("None") + hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") + hal.command_buffer.finalize<%cmd : !hal.command_buffer> + util.return %cmd : !hal.command_buffer + } + util.func public @multiple_results(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %buffer_usage = hal.buffer_usage<"TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage"> : i32 + %memory_type = hal.memory_type<"DeviceVisible|DeviceLocal"> : i32 + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c-1_i64 = arith.constant -1 : i64 + %0 = util.null : !hal.fence + %c0_i64 = arith.constant 0 : i64 + %c-1_i32 = 
arith.constant -1 : i32 + %__device_0 = util.global.load immutable @__device_0 : !hal.device + %__multiple_results_memoize_result_0_device_0 = util.global.load immutable @__multiple_results_memoize_result_0_device_0 : !hal.command_buffer + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer + %allocator = hal.device.allocator<%__device_0 : !hal.device> : !hal.allocator + hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) + %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer + hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c8) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") + %fence = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + %transient_buffer = hal.device.queue.alloca<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) pool(%c0_i64) type(%memory_type) usage(%buffer_usage) flags("None") : !hal.buffer{%c128} + %fence_1 = hal.fence.create device(%__device_0 : !hal.device) flags("None") : !hal.fence + hal.device.queue.execute.indirect<%__device_0 : !hal.device> affinity(%c-1_i64) wait(%fence) signal(%fence_1) commands(%__multiple_results_memoize_result_0_device_0) bindings([ + (%buffer : !hal.buffer)[%c0, %c8], + (%buffer_0 : !hal.buffer)[%c0, %c8], + (%transient_buffer : !hal.buffer)[%c0, 
%c128] + ]) flags("None") + %status = hal.fence.await until([%fence_1]) timeout_millis(%c-1_i32) flags("None") : i32 + util.status.check_ok %status, "failed to wait on timepoint" + %view = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c0, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + %view_2 = hal.buffer_view.create buffer(%transient_buffer : !hal.buffer)[%c64, %c8] shape([%c2]) type(%element_type_f32) encoding(%dense_row_major) : !hal.buffer_view + util.return %view, %view_2 : !hal.buffer_view, !hal.buffer_view + } +} + + +// -----// IR Dump After ConversionPass (iree-vm-conversion) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.initializer { + %null = vm.const.ref.zero : !vm.ref + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c-1_0 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_1 = vm.const.i64.zero + %c1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %5 = vm.and.i32 %req, %slt : i32 + vm.cond_br %5, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %6 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref + %buffer = vm.rodata.inline "_utf8_hal_device_id_C6650FF277232B5A" {alignment = 1 : i64} : !vm.buffer = "hal.device.id" + %buffer_3 = vm.rodata.inline "_utf8_local_1A8FF0278D7661D8" {alignment = 1 : i64} : !vm.buffer = "local*" + %7:2 = vm.call 
@hal.device.query.i64(%ref, %buffer, %buffer_3) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %7#1 : i64 + %zero_4 = vm.const.i32.zero + %8 = vm.select.i32 %7#0, %nz, %zero_4 : i32 + %c1_5 = vm.const.i32 1 + vm.cond_br %8, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %buffer_6 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" + %buffer_7 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64" + %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_6, %buffer_7) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_8 = vm.cmp.nz.i64 %9#1 : i64 + %zero_9 = vm.const.i32.zero + %10 = vm.select.i32 %9#0, %nz_8, %zero_9 : i32 + %c1_10 = vm.const.i32 1 + vm.br ^bb4(%10 : i32) + ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_1 : i64 + %12 = vm.select.i64 %11, %c1, %zero_1 : i64 + %13 = vm.add.i64 %3, %12 : i64 + %14 = vm.and.i32 %11, %eq : i32 + %ref_11 = vm.select.ref %14, %ref, %null_2 : !vm.ref + %15 = vm.add.i64 %2, %c1 : i64 + vm.br ^bb1(%15, %13, %ref_11 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %req, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>" + vm.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %buffer_12 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" + %buffer_13 = vm.rodata.inline 
"_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64" + %16:2 = vm.call @hal.device.query.i64(%4, %buffer_12, %buffer_13) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_14 = vm.cmp.nz.i64 %16#1 : i64 + %zero_15 = vm.const.i32.zero + %17 = vm.select.i32 %16#0, %nz_14, %zero_15 : i32 + %c1_16 = vm.const.i32 1 + %18 = vm.select.i64 %17, %zero_1, %c-1 : i64 + %eq_17 = vm.cmp.eq.i64 %18, %zero_1 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_17, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %buffer_18 = vm.rodata.inline "multiple_results_dispatch_0_embedded_elf_arm_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000
000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000
000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F00000003000000020000000000000010020000000000001002000000000000230000000000000000000000000000000100000000000000000000000000000017000000040000000200000000000000380200000000000038020000000000002001000000000000010000000000000008000000000000001800000000000
0002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000
000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + %buffer_19 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64" + %null_20 = vm.const.ref.zero : !vm.buffer + %ref_21 = vm.call @hal.executable.create(%4, %c-1_0, %buffer_19, %buffer_18, %null_20) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.br ^bb10(%ref_21 : !vm.ref) + ^bb9: // pred: ^bb7 + vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + vm.br ^bb10(%null : !vm.ref) + ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 + vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_22 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_22, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c1 = vm.const.i64 1 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %c2 = vm.const.i64 2 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c3 = vm.const.i64 3 + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %zero_1 = vm.const.i32.zero + %c3_2 = vm.const.i32 3 + %c3_3 = vm.const.i32 3 + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + %zero_4 = vm.const.i32.zero + %zero_5 = vm.const.i32.zero + %c1_6 = vm.const.i32 1 + %c1_7 = vm.const.i32 1 + %c1_8 = vm.const.i32 1 + %zero_9 = vm.const.i64 0 + %zero_10 = vm.const.i32.zero + %null = vm.const.ref.zero : !vm.ref + %c2_11 = 
vm.const.i32 2 + %null_12 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + %zero_13 = vm.const.i32.zero + %zero_14 = vm.const.i32.zero + %c1_15 = vm.const.i32 1 + %c1_16 = vm.const.i32 1 + %c1_17 = vm.const.i32 1 + %zero_18 = vm.const.i64 0 + %c1_19 = vm.const.i32 1 + %null_20 = vm.const.ref.zero : !vm.ref + %c2_21 = vm.const.i32 2 + %null_22 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + %c28 = vm.const.i32 28 + %c13 = vm.const.i32 13 + %zero_23 = vm.const.i64.zero + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref + vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple, i64> ...) 
-> (!vm.ref, i64) attributes {nosideeffects} + vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref + vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref) + vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 + vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) + vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) + vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref, %buffer : !vm.ref, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32) + vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, 
%target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64) + vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) 
+ vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.fill(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.device.queue.update(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.copy(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.barrier(%device : 
!vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) + vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) + vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 + vm.import private @hal.fence.signal(%fence : !vm.ref) + vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) 
-> i32 attributes {vm.yield} + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %zero_0 = vm.const.i64.zero + %c-1_1 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %c553648160 = vm.const.i32 553648160 + %c1 = vm.const.i32 1 + %buffer = vm.rodata.inline "_utf8_input0_DCE99660CEB3F6B" {alignment = 1 : i64} : !vm.buffer = "input0" + vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %buffer_3 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor" + %c16 = vm.const.i32 16 + %c3075_4 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref, %buffer_3, %ref_2, %c8, %c16, %c3075_4) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %buffer_5 = vm.rodata.inline "_utf8_input1_B898B726583C85DA" {alignment = 1 : i64} : !vm.buffer = "input1" + vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_5, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %buffer_7 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor" + %c16_8 = vm.const.i32 16 + %c3075_9 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref_6, %buffer_7, %ref_2, %c8, %c16_8, %c3075_9) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %zero_10 = vm.const.i64.zero + %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref, i64) -> !vm.ref + %zero_12 = vm.const.i64.zero + %ref_13 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_11, %zero_0, %c48, %c3075, %c128, %zero_12) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %zero_14 = vm.const.i64.zero + %ref_15 = vm.call @hal.fence.create(%__device_0, %zero_14) : (!vm.ref, i64) -> !vm.ref + %zero_16 = vm.const.i64 0 + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_11, %ref_15, %__multiple_results_memoize_result_0_device_0, %zero_16, [(%ref, %zero, %c8), (%ref_6, %zero, %c8), (%ref_13, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %zero_17 = vm.const.i64.zero + %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_17, [%ref_15]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_fail %0, "failed to wait on timepoint" + %ref_18 = vm.call.variadic @hal.buffer_view.create(%ref_13, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + %ref_19 = vm.call.variadic @hal.buffer_view.create(%ref_13, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + vm.return %ref_18, %ref_19 : !vm.ref, !vm.ref + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After ReifyRodataTablesPass (iree-vm-reify-rodata-tables) //----- // +vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.initializer { + %null = vm.const.ref.zero : !vm.ref + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c-1_0 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_1 = vm.const.i64.zero + %c1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %5 = vm.and.i32 %req, %slt : i32 + vm.cond_br %5, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %6 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref + %buffer = vm.rodata.inline "_utf8_hal_device_id_C6650FF277232B5A" {alignment = 1 : i64} : !vm.buffer = "hal.device.id" + %buffer_3 = vm.rodata.inline "_utf8_local_1A8FF0278D7661D8" {alignment = 1 : i64} : !vm.buffer = "local*" + %7:2 = vm.call @hal.device.query.i64(%ref, %buffer, %buffer_3) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %7#1 : i64 + %zero_4 = vm.const.i32.zero + %8 = vm.select.i32 %7#0, %nz, %zero_4 : i32 + %c1_5 = vm.const.i32 1 + vm.cond_br %8, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %buffer_6 = 
vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" + %buffer_7 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64" + %9:2 = vm.call @hal.device.query.i64(%ref, %buffer_6, %buffer_7) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_8 = vm.cmp.nz.i64 %9#1 : i64 + %zero_9 = vm.const.i32.zero + %10 = vm.select.i32 %9#0, %nz_8, %zero_9 : i32 + %c1_10 = vm.const.i32 1 + vm.br ^bb4(%10 : i32) + ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_1 : i64 + %12 = vm.select.i64 %11, %c1, %zero_1 : i64 + %13 = vm.add.i64 %3, %12 : i64 + %14 = vm.and.i32 %11, %eq : i32 + %ref_11 = vm.select.ref %14, %ref, %null_2 : !vm.ref + %15 = vm.add.i64 %2, %c1 : i64 + vm.br ^bb1(%15, %13, %ref_11 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %req, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>" + vm.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %buffer_12 = vm.rodata.inline "_utf8_hal_executable_format_E03EECB63A2AAF52" {alignment = 1 : i64} : !vm.buffer = "hal.executable.format" + %buffer_13 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64" + %16:2 = vm.call @hal.device.query.i64(%4, %buffer_12, %buffer_13) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_14 = vm.cmp.nz.i64 %16#1 : i64 + %zero_15 = vm.const.i32.zero + %17 = 
vm.select.i32 %16#0, %nz_14, %zero_15 : i32 + %c1_16 = vm.const.i32 1 + %18 = vm.select.i64 %17, %zero_1, %c-1 : i64 + %eq_17 = vm.cmp.eq.i64 %18, %zero_1 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_17, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %buffer_18 = vm.rodata.inline "multiple_results_dispatch_0_embedded_elf_arm_64" {alignment = 16 : i64, mime_type = "application/x-elf"} : !vm.buffer = dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D8040200000000004805020000000000030400000000000080030000000000006005020
0000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697
370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000
000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + %buffer_19 = vm.rodata.inline "_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5" {alignment = 1 : i64} : !vm.buffer = "embedded-elf-arm_64" + %null_20 = vm.const.ref.zero : !vm.buffer + %ref_21 = vm.call @hal.executable.create(%4, %c-1_0, 
%buffer_19, %buffer_18, %null_20) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.br ^bb10(%ref_21 : !vm.ref) + ^bb9: // pred: ^bb7 + vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + vm.br ^bb10(%null : !vm.ref) + ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 + vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_22 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_22, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c1 = vm.const.i64 1 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %c2 = vm.const.i64 2 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c3 = vm.const.i64 3 + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %zero_1 = vm.const.i32.zero + %c3_2 = vm.const.i32 3 + %c3_3 = vm.const.i32 3 + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + %zero_4 = vm.const.i32.zero + %zero_5 = vm.const.i32.zero + %c1_6 = vm.const.i32 1 + %c1_7 = vm.const.i32 1 + %c1_8 = vm.const.i32 1 + %zero_9 = vm.const.i64 0 + %zero_10 = vm.const.i32.zero + %null = vm.const.ref.zero : !vm.ref + %c2_11 = vm.const.i32 2 + %null_12 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref, 
!vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + %zero_13 = vm.const.i32.zero + %zero_14 = vm.const.i32.zero + %c1_15 = vm.const.i32 1 + %c1_16 = vm.const.i32 1 + %c1_17 = vm.const.i32 1 + %zero_18 = vm.const.i64 0 + %c1_19 = vm.const.i32 1 + %null_20 = vm.const.ref.zero : !vm.ref + %c2_21 = vm.const.i32 2 + %null_22 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + %c28 = vm.const.i32 28 + %c13 = vm.const.i32 13 + %zero_23 = vm.const.i64.zero + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref + vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple, i64> ...) 
-> (!vm.ref, i64) attributes {nosideeffects} + vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref + vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref) + vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 + vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) + vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) + vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref, %buffer : !vm.ref, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32) + vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, 
%target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64) + vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) 
+ vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.fill(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.device.queue.update(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.copy(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.barrier(%device : 
!vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) + vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) + vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 + vm.import private @hal.fence.signal(%fence : !vm.ref) + vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) 
-> i32 attributes {vm.yield} + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %zero_0 = vm.const.i64.zero + %c-1_1 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %c553648160 = vm.const.i32 553648160 + %c1 = vm.const.i32 1 + %buffer = vm.rodata.inline "_utf8_input0_DCE99660CEB3F6B" {alignment = 1 : i64} : !vm.buffer = "input0" + vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %buffer_3 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor" + %c16 = vm.const.i32 16 + %c3075_4 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref, %buffer_3, %ref_2, %c8, %c16, %c3075_4) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %buffer_5 = vm.rodata.inline "_utf8_input1_B898B726583C85DA" {alignment = 1 : i64} : !vm.buffer = "input1" + vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_5, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %buffer_7 = vm.rodata.inline "_utf8_tensor_FC1814BC4A58F22A" {alignment = 1 : i64} : !vm.buffer = "tensor" + %c16_8 = vm.const.i32 16 + %c3075_9 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref_6, %buffer_7, %ref_2, %c8, %c16_8, %c3075_9) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %zero_10 = vm.const.i64.zero + %ref_11 = vm.call @hal.fence.create(%__device_0, %zero_10) : (!vm.ref, i64) -> !vm.ref + %zero_12 = vm.const.i64.zero + %ref_13 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_11, %zero_0, %c48, %c3075, %c128, %zero_12) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %zero_14 = vm.const.i64.zero + %ref_15 = vm.call @hal.fence.create(%__device_0, %zero_14) : (!vm.ref, i64) -> !vm.ref + %zero_16 = vm.const.i64 0 + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_11, %ref_15, %__multiple_results_memoize_result_0_device_0, %zero_16, [(%ref, %zero, %c8), (%ref_6, %zero, %c8), (%ref_13, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %zero_17 = vm.const.i64.zero + %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_17, [%ref_15]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_fail %0, "failed to wait on timepoint" + %ref_18 = vm.call.variadic @hal.buffer_view.create(%ref_13, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + %ref_19 = vm.call.variadic @hal.buffer_view.create(%ref_13, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + vm.return %ref_18, %ref_19 : !vm.ref, !vm.ref + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} +} + +// -----// IR Dump After HoistInlinedRodataPass (iree-vm-hoist-inlined-rodata) //----- // +vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52_0 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.initializer { + %null = vm.const.ref.zero : !vm.ref + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c-1_0 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_1 = vm.const.i64.zero + %c1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %req = vm.cmp.eq.ref %4, %null_2 : 
!vm.ref + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %5 = vm.and.i32 %req, %slt : i32 + vm.cond_br %5, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %6 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %7#1 : i64 + %zero_3 = vm.const.i32.zero + %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32 + %c1_4 = vm.const.i32 1 + vm.cond_br %8, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_5 = vm.cmp.nz.i64 %9#1 : i64 + %zero_6 = vm.const.i32.zero + %10 = vm.select.i32 %9#0, %nz_5, %zero_6 : i32 + %c1_7 = vm.const.i32 1 + vm.br ^bb4(%10 : i32) + ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_1 : i64 + %12 = vm.select.i64 %11, %c1, %zero_1 : i64 + %13 = vm.add.i64 %3, %12 : i64 + %14 = vm.and.i32 %11, %eq : i32 + %ref_8 = vm.select.ref %14, %ref, %null_2 : !vm.ref + %15 = vm.add.i64 %2, %c1 : i64 + vm.br ^bb1(%15, %13, %ref_8 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %req, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", 
"embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>" + vm.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %_utf8_hal_executable_format_E03EECB63A2AAF52_0 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52_0 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1 : !vm.buffer + %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_0, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_1) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_9 = vm.cmp.nz.i64 %16#1 : i64 + %zero_10 = vm.const.i32.zero + %17 = vm.select.i32 %16#0, %nz_9, %zero_10 : i32 + %c1_11 = vm.const.i32 1 + %18 = vm.select.i64 %17, %zero_1, %c-1 : i64 + %eq_12 = vm.cmp.eq.i64 %18, %zero_1 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_12, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2 : !vm.buffer + %null_13 = vm.const.ref.zero : !vm.buffer + %ref_14 = vm.call @hal.executable.create(%4, %c-1_0, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_2, %multiple_results_dispatch_0_embedded_elf_arm_64, %null_13) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.br ^bb10(%ref_14 : !vm.ref) + ^bb9: // pred: ^bb7 + vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + vm.br ^bb10(%null : 
!vm.ref) + ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 + vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_15 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_15, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c1 = vm.const.i64 1 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %c2 = vm.const.i64 2 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c3 = vm.const.i64 3 + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %zero_1 = vm.const.i32.zero + %c3_2 = vm.const.i32 3 + %c3_3 = vm.const.i32 3 + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + %zero_4 = vm.const.i32.zero + %zero_5 = vm.const.i32.zero + %c1_6 = vm.const.i32 1 + %c1_7 = vm.const.i32 1 + %c1_8 = vm.const.i32 1 + %zero_9 = vm.const.i64 0 + %zero_10 = vm.const.i32.zero + %null = vm.const.ref.zero : !vm.ref + %c2_11 = vm.const.i32 2 + %null_12 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) 
+ %zero_13 = vm.const.i32.zero + %zero_14 = vm.const.i32.zero + %c1_15 = vm.const.i32 1 + %c1_16 = vm.const.i32 1 + %c1_17 = vm.const.i32 1 + %zero_18 = vm.const.i64 0 + %c1_19 = vm.const.i32 1 + %null_20 = vm.const.ref.zero : !vm.ref + %c2_21 = vm.const.i32 2 + %null_22 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + %c28 = vm.const.i32 28 + %c13 = vm.const.i32 13 + %zero_23 = vm.const.i64.zero + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref + vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple, i64> ...) 
-> (!vm.ref, i64) attributes {nosideeffects} + vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref + vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref) + vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 + vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) + vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) + vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref, %buffer : !vm.ref, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32) + vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, 
%target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64) + vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) 
+ vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.fill(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.device.queue.update(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.copy(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.barrier(%device : 
!vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) + vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) + vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 + vm.import private @hal.fence.signal(%fence : !vm.ref) + vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) 
-> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A_3 {alignment = 1 : i64} "tensor" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %zero_0 = vm.const.i64.zero + %c-1_1 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %c553648160 = vm.const.i32 553648160 + %c1 = vm.const.i32 1 + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + %c16 = vm.const.i32 16 + %c3075_3 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_2, %c8, %c16, %c3075_3) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_4 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A_3 : !vm.buffer + %c16_5 = vm.const.i32 16 + %c3075_6 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref_4, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_2, %c8, %c16_5, %c3075_6) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %zero_7 = vm.const.i64.zero + %ref_8 = vm.call @hal.fence.create(%__device_0, %zero_7) : (!vm.ref, i64) -> !vm.ref + %zero_9 = vm.const.i64.zero + %ref_10 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_8, %zero_0, %c48, %c3075, %c128, %zero_9) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %zero_11 = vm.const.i64.zero + %ref_12 = vm.call @hal.fence.create(%__device_0, %zero_11) : (!vm.ref, i64) -> !vm.ref + %zero_13 = vm.const.i64 0 + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_8, %ref_12, %__multiple_results_memoize_result_0_device_0, %zero_13, [(%ref, %zero, %c8), (%ref_4, %zero, %c8), (%ref_10, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) 
+ %zero_14 = vm.const.i64.zero + %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_14, [%ref_12]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_fail %0, "failed to wait on timepoint" + %ref_15 = vm.call.variadic @hal.buffer_view.create(%ref_10, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + %ref_16 = vm.call.variadic @hal.buffer_view.create(%ref_10, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_15, %ref_16 : !vm.ref, !vm.ref + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} +} + +// -----// IR Dump After DeduplicateRodataPass (iree-vm-deduplicate-rodata) //----- // +vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %null = vm.const.ref.zero : !vm.ref + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c-1_0 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_1 = vm.const.i64.zero + %c1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %5 = vm.and.i32 %req, %slt : i32 + vm.cond_br %5, ^bb2, ^bb5 + 
^bb2: // pred: ^bb1 + %6 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %7#1 : i64 + %zero_3 = vm.const.i32.zero + %8 = vm.select.i32 %7#0, %nz, %zero_3 : i32 + %c1_4 = vm.const.i32 1 + vm.cond_br %8, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_5 = vm.cmp.nz.i64 %9#1 : i64 + %zero_6 = vm.const.i32.zero + %10 = vm.select.i32 %9#0, %nz_5, %zero_6 : i32 + %c1_7 = vm.const.i32 1 + vm.br ^bb4(%10 : i32) + ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_1 : i64 + %12 = vm.select.i64 %11, %c1, %zero_1 : i64 + %13 = vm.add.i64 %3, %12 : i64 + %14 = vm.and.i32 %11, %eq : i32 + %ref_8 = vm.select.ref %14, %ref, %null_2 : !vm.ref + %15 = vm.add.i64 %2, %c1 : i64 + vm.br ^bb1(%15, %13, %ref_8 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %req, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>" + vm.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %_utf8_hal_executable_format_E03EECB63A2AAF52_9 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_10 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_9, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_10) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_11 = vm.cmp.nz.i64 %16#1 : i64 + %zero_12 = vm.const.i32.zero + %17 = vm.select.i32 %16#0, %nz_11, %zero_12 : i32 + %c1_13 = vm.const.i32 1 + %18 = vm.select.i64 %17, %zero_1, %c-1 : i64 + %eq_14 = vm.cmp.eq.i64 %18, %zero_1 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_14, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_15 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %null_16 = vm.const.ref.zero : !vm.buffer + %ref_17 = vm.call @hal.executable.create(%4, %c-1_0, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_15, %multiple_results_dispatch_0_embedded_elf_arm_64, %null_16) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.br ^bb10(%ref_17 : !vm.ref) + ^bb9: // pred: ^bb7 + vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + vm.br ^bb10(%null : !vm.ref) + ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 + vm.global.store.ref %19, 
@__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_18 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_18, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c1 = vm.const.i64 1 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %c2 = vm.const.i64 2 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c3 = vm.const.i64 3 + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %zero_1 = vm.const.i32.zero + %c3_2 = vm.const.i32 3 + %c3_3 = vm.const.i32 3 + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_1, %c3_2, %c-1, %c3_3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + %zero_4 = vm.const.i32.zero + %zero_5 = vm.const.i32.zero + %c1_6 = vm.const.i32 1 + %c1_7 = vm.const.i32 1 + %c1_8 = vm.const.i32 1 + %zero_9 = vm.const.i64 0 + %zero_10 = vm.const.i32.zero + %null = vm.const.ref.zero : !vm.ref + %c2_11 = vm.const.i32 2 + %null_12 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_5, %c1_6, %c1_7, %c1_8, %zero_9, [%zero_0], [(%zero_4, %zero_10, %null, %zero, %c8), (%zero_4, %c2_11, %null_12, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) 
+ %zero_13 = vm.const.i32.zero + %zero_14 = vm.const.i32.zero + %c1_15 = vm.const.i32 1 + %c1_16 = vm.const.i32 1 + %c1_17 = vm.const.i32 1 + %zero_18 = vm.const.i64 0 + %c1_19 = vm.const.i32 1 + %null_20 = vm.const.ref.zero : !vm.ref + %c2_21 = vm.const.i32 2 + %null_22 = vm.const.ref.zero : !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_14, %c1_15, %c1_16, %c1_17, %zero_18, [%c64], [(%zero_13, %c1_19, %null_20, %zero, %c8), (%zero_13, %c2_21, %null_22, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + %c28 = vm.const.i32 28 + %c13 = vm.const.i32 13 + %zero_23 = vm.const.i64.zero + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero_23) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref + vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple, i64> ...) 
-> (!vm.ref, i64) attributes {nosideeffects} + vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref + vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref) + vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 + vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) + vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) + vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref, %buffer : !vm.ref, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32) + vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, 
%target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64) + vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) 
+ vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.fill(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.device.queue.update(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.copy(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.barrier(%device : 
!vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) + vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) + vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 + vm.import private @hal.fence.signal(%fence : !vm.ref) + vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) 
-> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %zero_0 = vm.const.i64.zero + %c-1_1 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %c553648160 = vm.const.i32 553648160 + %c1 = vm.const.i32 1 + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_2 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + %c16 = vm.const.i32 16 + %c3075_3 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_2, %c8, %c16, %c3075_3) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_4 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_5 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + %c16_6 = vm.const.i32 16 + %c3075_7 = vm.const.i32 3075 + vm.call @hal.buffer.assert(%ref_4, %_utf8_tensor_FC1814BC4A58F22A_5, %ref_2, %c8, %c16_6, %c3075_7) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %zero_8 = vm.const.i64.zero + %ref_9 = vm.call @hal.fence.create(%__device_0, %zero_8) : (!vm.ref, i64) -> !vm.ref + %zero_10 = vm.const.i64.zero + %ref_11 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_9, %zero_0, %c48, %c3075, %c128, %zero_10) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %zero_12 = vm.const.i64.zero + %ref_13 = vm.call @hal.fence.create(%__device_0, %zero_12) : (!vm.ref, i64) -> !vm.ref + %zero_14 = vm.const.i64 0 + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_9, %ref_13, %__multiple_results_memoize_result_0_device_0, %zero_14, [(%ref, %zero, %c8), (%ref_4, %zero, %c8), (%ref_11, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) 
+ %zero_15 = vm.const.i64.zero + %0 = vm.call.variadic @hal.fence.await(%c-1_1, %zero_15, [%ref_13]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_fail %0, "failed to wait on timepoint" + %ref_16 = vm.call.variadic @hal.buffer_view.create(%ref_11, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + %ref_17 = vm.call.variadic @hal.buffer_view.create(%ref_11, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_16, %ref_17 : !vm.ref, !vm.ref + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} +} + +// -----// IR Dump After DropUnusedCallsPass (iree-vm-drop-unused-calls) //----- // +vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %null = vm.const.ref.zero : !vm.buffer + %null_0 = vm.const.ref.zero : !vm.ref + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_1 = vm.const.i64.zero + %c1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %5 = vm.and.i32 %req, %slt : i32 + vm.cond_br %5, 
^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %6 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %7#1 : i64 + %8 = vm.select.i32 %7#0, %nz, %zero : i32 + vm.cond_br %8, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %9#1 : i64 + %10 = vm.select.i32 %9#0, %nz_3, %zero : i32 + vm.br ^bb4(%10 : i32) + ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_1 : i64 + %12 = vm.select.i64 %11, %c1, %zero_1 : i64 + %13 = vm.add.i64 %3, %12 : i64 + %14 = vm.and.i32 %11, %eq : i32 + %ref_4 = vm.select.ref %14, %ref, %null_2 : !vm.ref + %15 = vm.add.i64 %2, %c1 : i64 + vm.br ^bb1(%15, %13, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %req, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = 
#iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>" + vm.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %16#1 : i64 + %17 = vm.select.i32 %16#0, %nz_7, %zero : i32 + %18 = vm.select.i64 %17, %zero_1, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %18, %zero_1 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.br ^bb10(%ref_10 : !vm.ref) + ^bb9: // pred: ^bb7 + vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + vm.br ^bb10(%null_0 : !vm.ref) + ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 + vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return 
+ } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.ex.file.from_memory(%device : !vm.ref, %queue_affinity : i64, %access : i32, %buffer : !vm.buffer, %offset : i64, %length : i64, %flags : i32) -> !vm.ref + vm.import private @hal.allocator.select(%memory_types : i32, %buffer_usage : i32, %flags : i64, %from : tuple, i64> ...) 
-> (!vm.ref, i64) attributes {nosideeffects} + vm.import private @hal.allocator.allocate(%allocator : !vm.ref, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref + vm.import private @hal.allocator.import(%allocator : !vm.ref, %try : i32, %queue_affinity : i64, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer.allocation.preserve(%buffer : !vm.ref) + vm.import private @hal.buffer.allocation.discard(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.allocation.is_terminal(%buffer : !vm.ref) -> i32 + vm.import private @hal.buffer.subspan(%source_buffer : !vm.ref, %source_offset : i64, %length : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer.length(%buffer : !vm.ref) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer.load(%source_buffer : !vm.ref, %source_offset : i64, %length : i32) -> i32 + vm.import private @hal.buffer.store(%value : i32, %target_buffer : !vm.ref, %target_offset : i64, %length : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.element_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.encoding_type(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.rank(%buffer_view : !vm.ref) -> i32 attributes {nosideeffects} + vm.import private @hal.buffer_view.dim(%buffer_view : !vm.ref, %index : i32) -> i64 attributes {nosideeffects} + vm.import private @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref ...) + vm.import private @hal.channel.create(%device : !vm.ref, %queue_affinity : i64, %flags : i64, %id : !vm.buffer, %group : !vm.buffer, %rank : i32, %count : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.split(%channel : !vm.ref, %color : i32, %key : i32, %flags : i64) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.channel.rank_and_count(%channel : !vm.ref) -> (i32, i32) attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref, %label : !vm.buffer) + vm.import private @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.advise_buffer(%command_buffer : !vm.ref, %buffer : !vm.ref, %flags : i64, %arg0 : i64, %arg1 : i64, %buffer_slot : i32) + vm.import private @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, 
%target_buffer_slot : i32, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.command_buffer.update_buffer(%command_buffer : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %target_buffer_slot : i32, %flags : i64) + vm.import private @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref, %source_buffer_slot : i32, %target_buffer_slot : i32, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.command_buffer.collective(%command_buffer : !vm.ref, %channel : !vm.ref, %op : i32, %param : i32, %send_buffer_slot : i32, %recv_buffer_slot : i32, %send_buffer : !vm.ref, %recv_buffer : !vm.ref, %send_offset : i64, %send_length : i64, %recv_offset : i64, %recv_length : i64, %element_count : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroups_buffer_slot : i32, %workgroups_buffer : !vm.ref, %workgroups_offset : i64, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) 
+ vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.dealloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.fill(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %pattern : i64, %pattern_length : i32, %flags : i64) + vm.import private @hal.device.queue.update(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.buffer, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.copy(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.read(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_file : !vm.ref, %source_offset : i64, %target_buffer : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.write(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %source_buffer : !vm.ref, %source_offset : i64, %target_file : !vm.ref, %target_offset : i64, %length : i64, %flags : i64) + vm.import private @hal.device.queue.barrier(%device : 
!vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64) + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) + vm.import private @hal.device.queue.flush(%device : !vm.ref, %queue_affinity : i64) + vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.join(%flags : i64, %fences : !vm.ref ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.query(%fence : !vm.ref) -> i32 + vm.import private @hal.fence.signal(%fence : !vm.ref) + vm.import private @hal.fence.fail(%fence : !vm.ref, %status : i32) + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) 
-> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_fail %0, "failed to wait on timepoint" + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} +} + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %null = vm.const.ref.zero : !vm.buffer + %null_0 = vm.const.ref.zero : !vm.ref + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_1 = vm.const.i64.zero + %c1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_1, %zero_1, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %req = vm.cmp.eq.ref %4, %null_2 : !vm.ref + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %5 = vm.and.i32 %req, %slt : i32 + vm.cond_br %5, 
^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %6 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%6) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %7:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %7#1 : i64 + %8 = vm.select.i32 %7#0, %nz, %zero : i32 + vm.cond_br %8, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %9:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %9#1 : i64 + %10 = vm.select.i32 %9#0, %nz_3, %zero : i32 + vm.br ^bb4(%10 : i32) + ^bb4(%11: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_1 : i64 + %12 = vm.select.i64 %11, %c1, %zero_1 : i64 + %13 = vm.add.i64 %3, %12 : i64 + %14 = vm.and.i32 %11, %eq : i32 + %ref_4 = vm.select.ref %14, %ref, %null_2 : !vm.ref + %15 = vm.add.i64 %2, %c1 : i64 + vm.br ^bb1(%15, %13, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %req, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.cond_fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", iree.encoding.resolver = 
#iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>]>" + vm.br ^bb7 + ^bb7: // 2 preds: ^bb5, ^bb6 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %16:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %16#1 : i64 + %17 = vm.select.i32 %16#0, %nz_7, %zero : i32 + %18 = vm.select.i64 %17, %zero_1, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %18, %zero_1 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.br ^bb10(%ref_10 : !vm.ref) + ^bb9: // pred: ^bb7 + vm.cond_fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + vm.br ^bb10(%null_0 : !vm.ref) + ^bb10(%19: !vm.ref): // 2 preds: ^bb8, ^bb9 + vm.global.store.ref %19, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return 
+ } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_fail %0, "failed to wait on timepoint" + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2(%0 : i32), ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2(%1: i32): // pred: ^bb0 + vm.fail %1, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2(%0 : i32), ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2(%1: i32): // pred: ^bb0 + vm.fail %1, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After ResolveRodataLoadsPass (iree-vm-resolve-rodata-loads) //----- // +vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2(%0 : i32), ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2(%1: i32): // pred: ^bb0 + vm.fail %1, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) 
+ vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) +^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) +^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) 
{nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) +^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) +^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 +^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" +^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 +^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + 
%_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return +^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2(%0 : i32), ^bb1 +^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref +^bb2(%1: i32): // pred: ^bb0 + vm.fail %1, "failed to wait on timepoint" +} + +// -----// IR Dump After Inliner (inline) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2(%0 : i32), ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2(%1: i32): // pred: ^bb0 + vm.fail %1, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2(%0 : i32), ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2(%1: i32): // pred: ^bb0 + vm.fail %1, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After DropUnusedCallsPass (iree-vm-drop-unused-calls) //----- // +vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} +} + +// -----// IR Dump After SymbolDCE (symbol-dce) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private @__device_0 : !vm.ref + vm.global.ref private @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.initializer { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + 
vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, 
iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = 
#util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref immutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
-> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) + vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref immutable @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref immutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + } +} + + +// -----// IR Dump After GlobalInitializationPass (iree-vm-global-initialization) //----- // +vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.br ^bb10 + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + ^bb10: // pred: ^bb8 + vm.return + } + vm.export @__deinit + vm.func private @__deinit() { + vm.return + } +} + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} 
"embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000
00000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E740000000000380000000400190000
00010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000
0000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + 
%__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A_3 = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A_3, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_4 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_5 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_4, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_6 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_4, %ref_6, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_5, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_6]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_5, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_8 = vm.call.variadic @hal.buffer_view.create(%ref_5, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_7, %ref_8 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %ref_10 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_9, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_10, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_11 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_11, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.export @__deinit + vm.func private @__deinit() { + vm.return + } + } +} + + +// -----// IR Dump After CSE (cse) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private 
@multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F3000000000000000000000000000000001020100000001000000010000000000000000000000000000000000000000000000000000
00000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D
000000000000090270040100000000000105010A82060B08E4020800010149524545000000000000000000000000000000000000000000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C000000000000000030000000000000008000000000000
0010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = 
vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call @hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.export @__deinit + vm.func private @__deinit() { + vm.return + } + } +} + + +// -----// IR Dump After Canonicalizer (canonicalize) //----- // +module attributes {vm.toplevel} { + vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } + vm.export @__deinit + vm.func private @__deinit() { + vm.return + } + } +} + + +// -----// IR Dump After DropEmptyModuleInitializersPass (iree-vm-drop-empty-module-initializers) //----- // +vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } +} + +// -----// IR Dump After AnnotateFunctionsPass (iree-vm-annotate-functions) //----- // +vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never, vm.unwind} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + %0 = vm.call.variadic @hal.fence.await(%c-1_0, %zero, [%ref_5]) : (i32, i64, !vm.ref ...) -> i32 + vm.cond_br %0, ^bb2, ^bb1 + ^bb1: // pred: ^bb0 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb2: // pred: ^bb0 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() attributes {vm.unwind} { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } +} + +// -----// IR Dump After ConvertToYieldableCallsPass (iree-vm-convert-to-yieldable-calls) //----- // +vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never, vm.unwind} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + vm.call.variadic.yieldable @hal.fence.await(%c-1_0, %zero, %ref_5) {segment_sizes = dense<[-1, -1, 1]> : vector<3xi16>, segment_types = [i32, i64, !vm.ref]} : (i32, i64, !vm.ref) -> ^bb1 (i32) + ^bb1(%0: i32): // pred: ^bb0 + vm.cond_br %0, ^bb3, ^bb2 + ^bb2: // pred: ^bb1 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb3: // pred: ^bb1 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() attributes {vm.unwind} { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } +} + +// -----// IR Dump After DropOptimizationBarriersPass (iree-vm-drop-optimization-barriers) //----- // +vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never, vm.unwind} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + vm.call.variadic.yieldable @hal.fence.await(%c-1_0, %zero, %ref_5) {segment_sizes = dense<[-1, -1, 1]> : vector<3xi16>, segment_types = [i32, i64, !vm.ref]} : (i32, i64, !vm.ref) -> ^bb1 (i32) + ^bb1(%0: i32): // pred: ^bb0 + vm.cond_br %0, ^bb3, ^bb2 + ^bb2: // pred: ^bb1 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb3: // pred: ^bb1 + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() attributes {vm.unwind} { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + %_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata 
@_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = 
vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } +} + +// -----// IR Dump After MaterializeRefDiscardsPass (iree-vm-materialize-ref-discards) //----- // +vm.module public @module { + vm.global.ref private mutable @__device_0 : !vm.ref + vm.global.ref private mutable @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + vm.global.ref private mutable @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.rodata private @_utf8_hal_device_id_C6650FF277232B5A {alignment = 1 : i64} "hal.device.id" + vm.rodata private @_utf8_local_1A8FF0278D7661D8 {alignment = 1 : i64} "local*" + vm.rodata private @_utf8_hal_executable_format_E03EECB63A2AAF52 {alignment = 1 : i64} "hal.executable.format" + vm.rodata private @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 {alignment = 1 : i64} "embedded-elf-arm_64" + vm.rodata private @multiple_results_dispatch_0_embedded_elf_arm_64 {alignment = 16 : i64, mime_type = "application/x-elf"} 
dense<"0x7F454C460201010000000000000000000300B70001000000000000000000000040000000000000000809000000000000000000004000380007004000150013000600000004000000400000000000000040000000000000004000000000000000880100000000000088010000000000000800000000000000010000000400000000000000000000000000000000000000000000000000000070040000000000007004000000000000000001000000000001000000050000007004000000000000700401000000000070040100000000004400000000000000440000000000000000000100000000000100000006000000C004000000000000C004020000000000C004020000000000A001000000000000400B00000000000000000100000000000200000006000000A005000000000000A005020000000000A005020000000000C000000000000000C000000000000000080000000000000052E5746404000000C004000000000000C004020000000000C004020000000000A001000000000000400B000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000012000700A004010000000000140000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000C80402000000000003040000000000006003000000000000D80402000000000003040000000000007004010000000000E0040200000000000304000000000000C003000000000000F0040200000000000304000000000000EE0300000000000008050200000000000304000000000000280400000000000010050200000000000304000000000000280400000000000020050200000000000304000000000000C00402000000000040050200000000000304000000000000D80402000000000048050200000000000304000000000000800300000000000060050200000000000304000000000000E00402000000000078050200000000000304000000000000E80402000000000080050200000000000304000000000000000502000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F300000000000000000000000000000000102010000000100000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E747
76973655F325F663332002E2F6578706572696D656E74616C2F7765622F73616D706C655F7765626770752F6D756C7469706C655F726573756C74732E6D6C6972000000001000000000000000017A5200017C1E011B0C1F0018000000180000002C0001003000000000480C1D109E029D04000000100000003400000040000100140000000000000000000000FD7BBFA9FD0300912AA041A9E0031F2A092140A9200140FD490140B900F8A00E29757E92006929FCFD7BC1A8C0035FD61F1800711F2003D5C803081000019F9AC0035FD600000000000000000000000006000000000000000000000000000000000000000000000000000000000000000000000000000000010000003600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000200100000000000009000000000000001800000000000000F9FFFF6F000000000C000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E1017B44219110112060000022E001101120640186E0E030E3A0B3B0B49133F190000032400030E3E0B0B0B000000470000000400000000000801340000002C00040000000000000070040100000000003000000002700401000000000030000000016D06000000060000000101430000000300000000050400696E74002D006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F663332004952454500400000000200000000004B000000260000006D756C7469706C655F726573756C74735F64697370617463685F305F656C656D656E74776973655F325F6633320000000000160000000200000000004B00000043000000696E74000000000038000000040019000000010101FB0E0D000101010100000001000001002D000000000000090270040100000000000105010A82060B08E40208000101495245450000000000000000000000000000000000000
00000000000000000000000002300000000020900A00502000000000000000000000000000100000012000700A0040100000000001400000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E65685F6672616D65002E74657874002E646174612E72656C2E726F002E64796E616D6963002E72656C726F5F70616464696E67002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F000000030000000200000000000000100200000000000010020000000000002300000000000000000000000000000001000000000000000000000000000000170000000400000002000000000000003802000000000000380200000000000020010000000000000100000000000000080000000000000018000000000000002100000001000000020000000000000060030000000000006003000000000000C800000000000000000000000000000010000000000000000000000000000000290000000100000002000000000000002804000000000000280400000000000048000000000000000000000000000000080000000000000000000000000000003300000001000000060000000000000070040100000000007004000000000000440000000000000000000000000000000400000000000000000000000000000039000000010000000300000000000000C004020000000000C004000000000000E00000000000000000000000000000001000000000000000000000000000000046000000060000000300000000000000A005020000000000A005000000000000C0000000000000000300000000000000080000000000000010000000000000004F00000008000000030000000000000060060200000000006006000000000000A0090000000000000000000
000000000010000000000000000000000000000005E0000000100000000000000000000000000000000000000600600000000000037000000000000000000000000000000010000000000000000000000000000006C000000010000000000000000000000000000000000000097060000000000004B00000000000000000000000000000001000000000000000000000000000000780000000100000030000000000000000000000000000000E20600000000000039000000000000000000000000000000010000000000000001000000000000008300000001000000000000000000000000000000000000001B0700000000000044000000000000000000000000000000010000000000000000000000000000009300000001000000000000000000000000000000000000005F070000000000001A00000000000000000000000000000001000000000000000000000000000000A3000000010000000000000000000000000000000000000079070000000000003C00000000000000000000000000000001000000000000000000000000000000AF0000000100000030000000000000000000000000000000B5070000000000000500000000000000000000000000000001000000000000000100000000000000B80000000200000000000000000000000000000000000000C0070000000000004800000000000000140000000200000008000000000000001800000000000000C000000003000000000000000000000000000000000000000808000000000000D200000000000000000000000000000001000000000000000000000000000000CA0000000300000000000000000000000000000000000000DA080000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3656xi8> + vm.func private @__multiple_results_memoize_apply() -> !vm.ref attributes {inlining_policy = #util.inline.never, vm.unwind} { + %c13 = vm.const.i32 13 + %c28 = vm.const.i32 28 + %c2 = vm.const.i32 2 + %null = vm.const.ref.zero : !vm.ref + %c1 = vm.const.i32 1 + %c3 = vm.const.i32 3 + %c64 = vm.const.i32 64 + %c128 = vm.const.i64 128 + %c8 = vm.const.i64 8 + %zero = vm.const.i64.zero + %zero_0 = vm.const.i32.zero + %c-1 = vm.const.i64 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__device_0_executable_0_multiple_results_dispatch_0 = vm.global.load.ref @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref = vm.call 
@hal.command_buffer.create(%__device_0, %zero_0, %c3, %c-1, %c3) : (!vm.ref, i32, i32, i64, i32) -> !vm.ref + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%zero_0], [(%zero_0, %zero_0, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call.variadic @hal.command_buffer.dispatch(%ref, %__device_0_executable_0_multiple_results_dispatch_0, %zero_0, %c1, %c1, %c1, %zero, [%c64], [(%zero_0, %c1, %null, %zero, %c8), (%zero_0, %c2, %null, %zero, %c128)]) : (!vm.ref, !vm.ref, i32, i32, i32, i32, i64, i32 ..., tuple, i64, i64> ...) + vm.call @hal.command_buffer.execution_barrier(%ref, %c28, %c13, %zero) : (!vm.ref, i32, i32, i64) -> () + vm.call @hal.command_buffer.finalize(%ref) : (!vm.ref) -> () + vm.return %ref : !vm.ref + } + vm.import private @hal.buffer.assert(%buffer : !vm.ref, %message : !vm.buffer, %allocator : !vm.ref, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) + vm.import private @hal.buffer_view.create(%buffer : !vm.ref, %source_offset : i64, %source_length : i64, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.buffer_view.assert(%buffer_view : !vm.ref, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) 
+ vm.import private @hal.buffer_view.buffer(%buffer_view : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.command_buffer.create(%device : !vm.ref, %modes : i32, %command_categories : i32, %queue_affinity : i64, %binding_capacity : i32) -> !vm.ref attributes {minimum_version = 6 : i32} + vm.import private @hal.command_buffer.finalize(%command_buffer : !vm.ref) + vm.import private @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i64) + vm.import private @hal.command_buffer.dispatch(%command_buffer : !vm.ref, %executable : !vm.ref, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32, %flags : i64, %constants : i32 ..., %bindings : tuple, i64, i64> ...) + vm.import private @hal.device.allocator(%device : !vm.ref) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.device.query.i64(%device : !vm.ref, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i64) attributes {nosideeffects} + vm.import private @hal.device.queue.alloca(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %pool : i64, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64, %flags : i64) -> !vm.ref + vm.import private @hal.device.queue.execute.indirect(%device : !vm.ref, %queue_affinity : i64, %wait_fence : !vm.ref, %signal_fence : !vm.ref, %command_buffer : !vm.ref, %flags : i64, %binding_table : tuple, i64, i64> ...) 
+ vm.import private @hal.devices.count() -> i32 attributes {nosideeffects} + vm.import private @hal.devices.get(%index : i32) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.executable.create(%device : !vm.ref, %queue_affinity : i64, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer) -> !vm.ref attributes {nosideeffects} + vm.import private @hal.fence.create(%device : !vm.ref, %flags : i64) -> !vm.ref + vm.import private @hal.fence.await(%timeout_millis : i32, %flags : i64, %fences : !vm.ref ...) -> i32 attributes {vm.yield} + vm.rodata private @_utf8_input0_DCE99660CEB3F6B {alignment = 1 : i64} "input0" + vm.rodata private @_utf8_tensor_FC1814BC4A58F22A {alignment = 1 : i64} "tensor" + vm.rodata private @_utf8_input1_B898B726583C85DA {alignment = 1 : i64} "input1" + vm.func private @multiple_results(%arg0: !vm.ref, %arg1: !vm.ref) -> (!vm.ref, !vm.ref) attributes {iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}, vm.unwind, vm.yield} { + %c16 = vm.const.i32 16 + %c1 = vm.const.i32 1 + %c553648160 = vm.const.i32 553648160 + %c3075 = vm.const.i32 3075 + %c48 = vm.const.i32 48 + %c2 = vm.const.i64 2 + %c8 = vm.const.i64 8 + %c64 = vm.const.i64 64 + %c128 = vm.const.i64 128 + %zero = vm.const.i64.zero + %c-1 = vm.const.i64 -1 + %null = vm.const.ref.zero : !vm.ref + %c-1_0 = vm.const.i32 -1 + %__device_0 = vm.global.load.ref @__device_0 : !vm.ref + %__multiple_results_memoize_result_0_device_0 = vm.global.load.ref @__multiple_results_memoize_result_0_device_0 : !vm.ref + %_utf8_input0_DCE99660CEB3F6B = vm.const.ref.rodata @_utf8_input0_DCE99660CEB3F6B : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_input0_DCE99660CEB3F6B, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) 
+ %ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref) -> !vm.ref + %ref_1 = vm.call @hal.device.allocator(%__device_0) {nosideeffects} : (!vm.ref) -> !vm.ref + %_utf8_tensor_FC1814BC4A58F22A = vm.const.ref.rodata @_utf8_tensor_FC1814BC4A58F22A : !vm.buffer + vm.call @hal.buffer.assert(%ref, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %_utf8_input1_B898B726583C85DA = vm.const.ref.rodata @_utf8_input1_B898B726583C85DA : !vm.buffer + vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_input1_B898B726583C85DA, %c553648160, %c1, [%c2]) : (!vm.ref, !vm.buffer, i32, i32, i64 ...) + %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref) -> !vm.ref + vm.call @hal.buffer.assert(%ref_2, %_utf8_tensor_FC1814BC4A58F22A, %ref_1, %c8, %c16, %c3075) : (!vm.ref, !vm.buffer, !vm.ref, i64, i32, i32) -> () + %ref_3 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + %ref_4 = vm.call @hal.device.queue.alloca(%__device_0, %c-1, %null, %ref_3, %zero, %c48, %c3075, %c128, %zero) : (!vm.ref, i64, !vm.ref, !vm.ref, i64, i32, i32, i64, i64) -> !vm.ref + %ref_5 = vm.call @hal.fence.create(%__device_0, %zero) : (!vm.ref, i64) -> !vm.ref + vm.call.variadic @hal.device.queue.execute.indirect(%__device_0, %c-1, %ref_3, %ref_5, %__multiple_results_memoize_result_0_device_0, %zero, [(%ref, %zero, %c8), (%ref_2, %zero, %c8), (%ref_4, %zero, %c128)]) : (!vm.ref, i64, !vm.ref, !vm.ref, !vm.ref, i64, tuple, i64, i64> ...) + vm.call.variadic.yieldable @hal.fence.await(%c-1_0, %zero, %ref_5) {segment_sizes = dense<[-1, -1, 1]> : vector<3xi16>, segment_types = [i32, i64, !vm.ref]} : (i32, i64, !vm.ref) -> ^bb1 (i32) + ^bb1(%0: i32): // pred: ^bb0 + vm.cond_br %0, ^bb3, ^bb2 + ^bb2: // pred: ^bb1 + %ref_6 = vm.call.variadic @hal.buffer_view.create(%ref_4, %zero, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) 
-> !vm.ref + %ref_7 = vm.call.variadic @hal.buffer_view.create(%ref_4, %c64, %c8, %c553648160, %c1, [%c2]) {nosideeffects} : (!vm.ref, i64, i64, i32, i32, i64 ...) -> !vm.ref + vm.return %ref_6, %ref_7 : !vm.ref, !vm.ref + ^bb3: // pred: ^bb1 + vm.discard.refs %ref_4 : !vm.ref + vm.fail %0, "failed to wait on timepoint" + } + vm.export @multiple_results attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @multiple_results(%input0: tensor<2xf32>, %input1: tensor<2xf32>) -> (%output0: tensor<2xf32>, %output1: tensor<2xf32>)"}} + vm.export @__init + vm.func private @__init() attributes {vm.unwind} { + %c1 = vm.const.i32 1 + %null = vm.const.ref.zero : !vm.buffer + %c14 = vm.const.i32 14 + %c-1 = vm.const.i64 -1 + %c18 = vm.const.i32 18 + %zero = vm.const.i32.zero + %zero_0 = vm.const.i64.zero + %c1_1 = vm.const.i64 1 + %null_2 = vm.const.ref.zero : !vm.ref + %0 = vm.call @hal.devices.count() {nosideeffects} : () -> i32 + %1 = vm.ext.i32.i64.s %0 : i32 -> i64 + vm.br ^bb1(%zero_0, %zero_0, %null_2 : i64, i64, !vm.ref) + ^bb1(%2: i64, %3: i64, %4: !vm.ref): // 2 preds: ^bb0, ^bb4 + %rnz = vm.cmp.nz.ref %4 : !vm.ref + %5 = vm.xor.i32 %rnz, %c1 : i32 + %slt = vm.cmp.lt.i64.s %2, %1 : i64 + %6 = vm.and.i32 %5, %slt : i32 + vm.cond_br %6, ^bb2, ^bb5 + ^bb2: // pred: ^bb1 + vm.discard.refs %4 : !vm.ref + %7 = vm.trunc.i64.i32 %2 : i64 -> i32 + %ref = vm.call @hal.devices.get(%7) {nosideeffects} : (i32) -> !vm.ref + %_utf8_hal_device_id_C6650FF277232B5A = vm.const.ref.rodata @_utf8_hal_device_id_C6650FF277232B5A : !vm.buffer + %_utf8_local_1A8FF0278D7661D8 = vm.const.ref.rodata @_utf8_local_1A8FF0278D7661D8 : !vm.buffer + %8:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_device_id_C6650FF277232B5A, %_utf8_local_1A8FF0278D7661D8) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz = vm.cmp.nz.i64 %8#1 : i64 + %9 = vm.select.i32 %8#0, %nz, %zero : i32 + vm.cond_br %9, ^bb3, ^bb4(%zero : i32) + ^bb3: // pred: ^bb2 + 
%_utf8_hal_executable_format_E03EECB63A2AAF52 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %10:2 = vm.call @hal.device.query.i64(%ref, %_utf8_hal_executable_format_E03EECB63A2AAF52, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5) {nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_3 = vm.cmp.nz.i64 %10#1 : i64 + %11 = vm.select.i32 %10#0, %nz_3, %zero : i32 + vm.br ^bb4(%11 : i32) + ^bb4(%12: i32): // 2 preds: ^bb2, ^bb3 + %eq = vm.cmp.eq.i64 %3, %zero_0 : i64 + %13 = vm.select.i64 %12, %c1_1, %zero_0 : i64 + %14 = vm.add.i64 %3, %13 : i64 + %15 = vm.and.i32 %12, %eq : i32 + %ref_4 = vm.select.ref %15, %ref, %null_2 : !vm.ref + %16 = vm.add.i64 %2, %c1_1 : i64 + vm.br ^bb1(%16, %14, %ref_4 : i64, i64, !vm.ref) + ^bb5: // pred: ^bb1 + vm.discard.refs %null_2 : !vm.ref + vm.cond_br %5, ^bb6, ^bb7 + ^bb6: // pred: ^bb5 + vm.discard.refs %null, %4 : !vm.buffer, !vm.ref + vm.fail %c18, "HAL device `__device_0` not found or unavailable: #hal.device.target<\22local\22, [#hal.executable.target<\22llvm-cpu\22, \22embedded-elf-arm_64\22, {cpu = \22\22, cpu_features = \22+reserve-x18\22, data_layout = \22e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32\22, iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = \22arm64-unknown-unknown-eabi-elf\22}>]>" + ^bb7: // pred: ^bb5 + %_utf8_hal_executable_format_E03EECB63A2AAF52_5 = vm.const.ref.rodata @_utf8_hal_executable_format_E03EECB63A2AAF52 : !vm.buffer + %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 = vm.const.ref.rodata @_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5 : !vm.buffer + %17:2 = vm.call @hal.device.query.i64(%4, %_utf8_hal_executable_format_E03EECB63A2AAF52_5, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6) 
{nosideeffects} : (!vm.ref, !vm.buffer, !vm.buffer) -> (i32, i64) + %nz_7 = vm.cmp.nz.i64 %17#1 : i64 + %18 = vm.select.i32 %17#0, %nz_7, %zero : i32 + %19 = vm.select.i64 %18, %zero_0, %c-1 : i64 + %eq_8 = vm.cmp.eq.i64 %19, %zero_0 : i64 + vm.global.store.ref %4, @__device_0 : !vm.ref + vm.cond_br %eq_8, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %multiple_results_dispatch_0_embedded_elf_arm_64 = vm.const.ref.rodata @multiple_results_dispatch_0_embedded_elf_arm_64 : !vm.buffer + %ref_9 = vm.call @hal.executable.create(%4, %c-1, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6, %multiple_results_dispatch_0_embedded_elf_arm_64, %null) {nosideeffects} : (!vm.ref, i64, !vm.buffer, !vm.buffer, !vm.buffer) -> !vm.ref + vm.global.store.ref %ref_9, @__device_0_executable_0_multiple_results_dispatch_0 : !vm.ref + %ref_10 = vm.call @__multiple_results_memoize_apply() : () -> !vm.ref + vm.global.store.ref %ref_10, @__multiple_results_memoize_result_0_device_0 : !vm.ref + vm.return + ^bb9: // pred: ^bb7 + vm.discard.refs %null, %4, %_utf8_embedded_elf_arm_64_87C64FBD3BCE06F5_6 : !vm.buffer, !vm.ref, !vm.buffer + vm.fail %c14, "HAL device `__device_0` does not support any variant of executable `multiple_results_dispatch_0`; available formats: [embedded-elf-arm_64]" + } +} + From 9fb4c614de7d6617882d7bb9dc7dc2e12e540520 Mon Sep 17 00:00:00 2001 From: sparsh Date: Fri, 16 Jan 2026 20:41:56 -0800 Subject: [PATCH 11/13] IREE: baseline attempt to compile Gemmini tile MLIR (expected fail log) --- .../mini_cnn_block.gemmini_tile.mlir | 191 ++++++++++++++++++ .../logs/mini_cnn_gemmini_tile.iree_fail.txt | 4 + 2 files changed, 195 insertions(+) create mode 100644 experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir create mode 100644 experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt diff --git a/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir b/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir new file mode 100644 index 0000000..3a03623 --- 
/dev/null +++ b/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir @@ -0,0 +1,191 @@ +module { + func.func @mini_cnn_block(%arg0: memref<1x3x32x32xf32>, %arg1: memref<16x3x3x3xf32>, %arg2: memref<32x16x3x3xf32>, %arg3: memref<1x32x26x26xf32>) { + %alloc = memref.alloc() : memref<1x16x30x30xf32> + %alloc_0 = memref.alloc() : memref<1x32x26x26xf32> + %alloc_1 = memref.alloc() : memref<1x32x32x3xf32> + %alloc_2 = memref.alloc() : memref<27x16xf32> + %alloc_3 = memref.alloc() : memref<16xi32> + %alloc_4 = memref.alloc() : memref<900x16xf32> + %c30_i64 = arith.constant 30 : i64 + %c3 = arith.constant 3 : index + %c3_5 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_6 = arith.constant 1 : index + scf.for %arg4 = %c0 to %c1 step %c1_6 { + %c0_27 = arith.constant 0 : index + %c3_28 = arith.constant 3 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c32_31 = arith.constant 32 : index + %c1_32 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c32_31 step %c1_32 { + %c0_33 = arith.constant 0 : index + %c32_34 = arith.constant 32 : index + %c1_35 = arith.constant 1 : index + scf.for %arg7 = %c0_33 to %c32_34 step %c1_35 { + %0 = memref.load %arg0[%arg4, %arg5, %arg6, %arg7] : memref<1x3x32x32xf32> + memref.store %0, %alloc_1[%arg4, %arg6, %arg7, %arg5] : memref<1x32x32x3xf32> + } + } + } + } + %c0_7 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c1_8 = arith.constant 1 : index + scf.for %arg4 = %c0_7 to %c16 step %c1_8 { + %c0_27 = arith.constant 0 : index + %c3_28 = arith.constant 3 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c3_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c3_31 = arith.constant 3 : index + %c1_32 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 { + %c0_33 = arith.constant 0 : index + %c3_34 = arith.constant 3 : index + %c1_35 
= arith.constant 1 : index + scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 { + %0 = arith.muli %arg6, %c3 : index + %1 = arith.muli %0, %c3_5 : index + %2 = arith.muli %arg7, %c3_5 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %arg1[%arg4, %arg5, %arg6, %arg7] : memref<16x3x3x3xf32> + memref.store %5, %alloc_2[%4, %arg4] : memref<27x16xf32> + } + } + } + } + %c3_i64 = arith.constant 3 : i64 + gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64 + %c0_9 = arith.constant 0 : index + %c1_10 = arith.constant 1 : index + %c1_11 = arith.constant 1 : index + scf.for %arg4 = %c0_9 to %c1_10 step %c1_11 { + %c0_27 = arith.constant 0 : index + %c16_28 = arith.constant 16 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c30 = arith.constant 30 : index + %c1_31 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c30 step %c1_31 { + %c0_32 = arith.constant 0 : index + %c30_33 = arith.constant 30 : index + %c1_34 = arith.constant 1 : index + scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 { + %c30_35 = arith.constant 30 : index + %0 = arith.muli %arg4, %c30_35 : index + %1 = arith.muli %0, %c30_35 : index + %2 = arith.muli %arg6, %c30_35 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg7 : index + %5 = memref.load %alloc_4[%4, %arg5] : memref<900x16xf32> + memref.store %5, %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32> + } + } + } + } + memref.dealloc %alloc_1 : memref<1x32x32x3xf32> + memref.dealloc %alloc_2 : memref<27x16xf32> + memref.dealloc %alloc_4 : memref<900x16xf32> + memref.dealloc %alloc_3 : memref<16xi32> + %alloc_12 = memref.alloc() : memref<1x30x30x16xf32> + %alloc_13 = memref.alloc() : memref<144x32xf32> + %alloc_14 = memref.alloc() : memref<32xi32> + %alloc_15 = memref.alloc() : 
memref<676x32xf32> + %c26_i64 = arith.constant 26 : i64 + %c3_16 = arith.constant 3 : index + %c16_17 = arith.constant 16 : index + %c0_18 = arith.constant 0 : index + %c1_19 = arith.constant 1 : index + %c1_20 = arith.constant 1 : index + scf.for %arg4 = %c0_18 to %c1_19 step %c1_20 { + %c0_27 = arith.constant 0 : index + %c16_28 = arith.constant 16 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c30 = arith.constant 30 : index + %c1_31 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c30 step %c1_31 { + %c0_32 = arith.constant 0 : index + %c30_33 = arith.constant 30 : index + %c1_34 = arith.constant 1 : index + scf.for %arg7 = %c0_32 to %c30_33 step %c1_34 { + %0 = memref.load %alloc[%arg4, %arg5, %arg6, %arg7] : memref<1x16x30x30xf32> + memref.store %0, %alloc_12[%arg4, %arg6, %arg7, %arg5] : memref<1x30x30x16xf32> + } + } + } + } + %c0_21 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1_22 = arith.constant 1 : index + scf.for %arg4 = %c0_21 to %c32 step %c1_22 { + %c0_27 = arith.constant 0 : index + %c16_28 = arith.constant 16 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c16_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c3_31 = arith.constant 3 : index + %c1_32 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c3_31 step %c1_32 { + %c0_33 = arith.constant 0 : index + %c3_34 = arith.constant 3 : index + %c1_35 = arith.constant 1 : index + scf.for %arg7 = %c0_33 to %c3_34 step %c1_35 { + %0 = arith.muli %arg6, %c3_16 : index + %1 = arith.muli %0, %c16_17 : index + %2 = arith.muli %arg7, %c16_17 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %arg2[%arg4, %arg5, %arg6, %arg7] : memref<32x16x3x3xf32> + memref.store %5, %alloc_13[%4, %arg4] : memref<144x32xf32> + } + } + } + } + %c3_i64_23 = arith.constant 3 : i64 + gemmini.tile_conv %alloc_12 %alloc_13 %alloc_14 
%alloc_15 %c26_i64 %c26_i64 %c3_i64_23 : memref<1x30x30x16xf32> memref<144x32xf32> memref<32xi32> memref<676x32xf32> i64 i64 i64 + %c0_24 = arith.constant 0 : index + %c1_25 = arith.constant 1 : index + %c1_26 = arith.constant 1 : index + scf.for %arg4 = %c0_24 to %c1_25 step %c1_26 { + %c0_27 = arith.constant 0 : index + %c32_28 = arith.constant 32 : index + %c1_29 = arith.constant 1 : index + scf.for %arg5 = %c0_27 to %c32_28 step %c1_29 { + %c0_30 = arith.constant 0 : index + %c26 = arith.constant 26 : index + %c1_31 = arith.constant 1 : index + scf.for %arg6 = %c0_30 to %c26 step %c1_31 { + %c0_32 = arith.constant 0 : index + %c26_33 = arith.constant 26 : index + %c1_34 = arith.constant 1 : index + scf.for %arg7 = %c0_32 to %c26_33 step %c1_34 { + %c26_35 = arith.constant 26 : index + %0 = arith.muli %arg4, %c26_35 : index + %1 = arith.muli %0, %c26_35 : index + %2 = arith.muli %arg6, %c26_35 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg7 : index + %5 = memref.load %alloc_15[%4, %arg5] : memref<676x32xf32> + memref.store %5, %alloc_0[%arg4, %arg5, %arg6, %arg7] : memref<1x32x26x26xf32> + } + } + } + } + memref.dealloc %alloc_12 : memref<1x30x30x16xf32> + memref.dealloc %alloc_13 : memref<144x32xf32> + memref.dealloc %alloc_15 : memref<676x32xf32> + memref.dealloc %alloc_14 : memref<32xi32> + linalg.copy ins(%alloc_0 : memref<1x32x26x26xf32>) outs(%arg3 : memref<1x32x26x26xf32>) + memref.dealloc %alloc : memref<1x16x30x30xf32> + memref.dealloc %alloc_0 : memref<1x32x26x26xf32> + return + } +} + diff --git a/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt new file mode 100644 index 0000000..d1d1619 --- /dev/null +++ b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt @@ -0,0 +1,4 @@ +/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: error: Dialect `gemmini' not found for custom op 'gemmini.tile_conv' + 
gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64 + ^ +/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: note: Available dialects: affine, amdgpu, arith, arm_neon, arm_sme, arm_sve, bufferization, builtin, cf, check, chlo, complex, emitc, flow, func, gpu, hal, hal_inline, hal_loader, io_parameters, iree_codegen, iree_cpu, iree_encoding, iree_gpu, iree_linalg_ext, iree_tensor_ext, iree_vector_ext, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, pcf, pdl, pdl_interp, quant, rocdl, scf, shape, shard, spirv, stablehlo, stream, tensor, tm_tensor, torch, torch_c, tosa, transform, ub, util, vector, vhlo, vm, vmvx ; for more info on dialect registration see https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management From f0a03f6766ae93ea740076edd9cd13af7f12d975 Mon Sep 17 00:00:00 2001 From: sparsh Date: Fri, 16 Jan 2026 20:47:12 -0800 Subject: [PATCH 12/13] IREE: baseline attempt to compile Gemmini tile MLIR (expected fail log) --- experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt index d1d1619..5193e6f 100644 --- a/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt +++ b/experiments/iree/logs/mini_cnn_gemmini_tile.iree_fail.txt @@ -1,4 +1,4 @@ -/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: error: Dialect `gemmini' not found for custom op 'gemmini.tile_conv' +experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: error: Dialect `gemmini' not found for custom op 'gemmini.tile_conv' gemmini.tile_conv %alloc_1 %alloc_2 %alloc_3 %alloc_4 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x3xf32> 
memref<27x16xf32> memref<16xi32> memref<900x16xf32> i64 i64 i64 ^ -/Users/sparshsingh/work/merlin/experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: note: Available dialects: affine, amdgpu, arith, arm_neon, arm_sme, arm_sve, bufferization, builtin, cf, check, chlo, complex, emitc, flow, func, gpu, hal, hal_inline, hal_loader, io_parameters, iree_codegen, iree_cpu, iree_encoding, iree_gpu, iree_linalg_ext, iree_tensor_ext, iree_vector_ext, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, pcf, pdl, pdl_interp, quant, rocdl, scf, shape, shard, spirv, stablehlo, stream, tensor, tm_tensor, torch, torch_c, tosa, transform, ub, util, vector, vhlo, vm, vmvx ; for more info on dialect registration see https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management +experiments/gemmini/iree_inputs/mini_cnn_block.gemmini_tile.mlir:62:5: note: Available dialects: affine, amdgpu, arith, arm_neon, arm_sme, arm_sve, bufferization, builtin, cf, check, chlo, complex, emitc, flow, func, gpu, hal, hal_inline, hal_loader, io_parameters, iree_codegen, iree_cpu, iree_encoding, iree_gpu, iree_linalg_ext, iree_tensor_ext, iree_vector_ext, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, pcf, pdl, pdl_interp, quant, rocdl, scf, shape, shard, spirv, stablehlo, stream, tensor, tm_tensor, torch, torch_c, tosa, transform, ub, util, vector, vhlo, vm, vmvx ; for more info on dialect registration see https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management From f2b44f76594654d08931c6a470e2e96ca85e22cf Mon Sep 17 00:00:00 2001 From: sparsh Date: Fri, 16 Jan 2026 20:47:19 -0800 Subject: [PATCH 13/13] Gemmini: add demo IR dump log for conv2d_block1 --- .../conv2d_block1.print-after-all.demo.mlir | 693 ++++++++++++++++++ 1 file changed, 693 insertions(+) create mode 100644 experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir diff --git 
a/experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir b/experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir new file mode 100644 index 0000000..e22f289 --- /dev/null +++ b/experiments/gemmini/logs/conv2d_block1.print-after-all.demo.mlir @@ -0,0 +1,693 @@ +// -----// IR Dump After (anonymous namespace)::LowerLinalgToGemminiPass (convert-linalg-to-gemmini) //----- // +module { + func.func @conv2d_block1(%arg0: memref<1x32x32x32xf16>, %arg1: memref<3x3x32x64xf16>, %arg2: memref<1x30x30x64xf32>) { + %alloc = memref.alloc() : memref<288x64xf16> + %alloc_0 = memref.alloc() : memref<900x64xf32> + %alloc_1 = memref.alloc() : memref<64xi32> + %c0_i32 = arith.constant 0 : i32 + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<64xi32>) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + scf.for %arg3 = %c0 to %c3 step %c1 { + %c3_3 = arith.constant 3 : index + scf.for %arg4 = %c0 to %c3_3 step %c1 { + %c32 = arith.constant 32 : index + scf.for %arg5 = %c0 to %c32 step %c1 { + %c64 = arith.constant 64 : index + scf.for %arg6 = %c0 to %c64 step %c1 { + %c3_4 = arith.constant 3 : index + %c32_5 = arith.constant 32 : index + %0 = arith.muli %arg3, %c3_4 : index + %1 = arith.muli %0, %c32_5 : index + %2 = arith.muli %arg4, %c32_5 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %arg1[%arg3, %arg4, %arg5, %arg6] : memref<3x3x32x64xf16> + memref.store %5, %alloc[%4, %arg6] : memref<288x64xf16> + } + } + } + } + %c30_i64 = arith.constant 30 : i64 + %c3_i64 = arith.constant 3 : i64 + gemmini.tile_conv %arg0 %alloc %alloc_1 %alloc_0 %c30_i64 %c30_i64 %c3_i64 : memref<1x32x32x32xf16> memref<288x64xf16> memref<64xi32> memref<900x64xf32> i64 i64 i64 + %c1_2 = arith.constant 1 : index + scf.for %arg3 = %c0 to %c1_2 step %c1 { + %c30 = arith.constant 30 : index + scf.for %arg4 = %c0 to %c30 step %c1 { + %c30_3 = arith.constant 30 : index + scf.for %arg5 = %c0 to 
%c30_3 step %c1 { + %c64 = arith.constant 64 : index + scf.for %arg6 = %c0 to %c64 step %c1 { + %c30_4 = arith.constant 30 : index + %0 = arith.muli %arg3, %c30_4 : index + %1 = arith.muli %0, %c30_4 : index + %2 = arith.muli %c30_4, %arg4 : index + %3 = arith.addi %1, %2 : index + %4 = arith.addi %3, %arg5 : index + %5 = memref.load %alloc_0[%4, %arg6] : memref<900x64xf32> + memref.store %5, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<1x30x30x64xf32> + } + } + } + } + memref.dealloc %alloc : memref<288x64xf16> + memref.dealloc %alloc_0 : memref<900x64xf32> + memref.dealloc %alloc_1 : memref<64xi32> + return + } +} + + +// -----// IR Dump After (anonymous namespace)::LowerGemminiToLLVMPass (lower-gemmini) //----- // +module { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + llvm.func @conv2d_block1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %2 = llvm.insertvalue %arg23, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %3 = llvm.insertvalue %arg24, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %4 = llvm.insertvalue %arg25, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %5 = llvm.insertvalue %arg29, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %6 = llvm.insertvalue %arg26, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %7 
= llvm.insertvalue %arg30, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %8 = llvm.insertvalue %arg27, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %9 = llvm.insertvalue %arg31, %8[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %10 = llvm.insertvalue %arg28, %9[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %11 = llvm.insertvalue %arg32, %10[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %13 = llvm.insertvalue %arg0, %12[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %14 = llvm.insertvalue %arg1, %13[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %15 = llvm.insertvalue %arg2, %14[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %16 = llvm.insertvalue %arg3, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.insertvalue %arg7, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg4, %17[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg8, %18[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg5, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg9, %20[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg6, %21[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg10, %22[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.insertvalue %arg11, %24[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %26 = llvm.insertvalue %arg12, 
%25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %27 = llvm.insertvalue %arg13, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %28 = llvm.insertvalue %arg14, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %29 = llvm.insertvalue %arg18, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %30 = llvm.insertvalue %arg15, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %31 = llvm.insertvalue %arg19, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %32 = llvm.insertvalue %arg16, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %33 = llvm.insertvalue %arg20, %32[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.insertvalue %arg17, %33[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.insertvalue %arg21, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(288 : index) : i64 + %37 = llvm.mlir.constant(64 : index) : i64 + %38 = llvm.mlir.constant(1 : index) : i64 + %39 = llvm.mlir.constant(18432 : index) : i64 + %40 = llvm.mlir.zero : !llvm.ptr + %41 = llvm.getelementptr %40[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.ptrtoint %41 : !llvm.ptr to i64 + %43 = llvm.call @malloc(%42) : (i64) -> !llvm.ptr + %44 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %45 = llvm.insertvalue %43, %44[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %46 = llvm.insertvalue %43, %45[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %47 = llvm.mlir.constant(0 : index) : i64 + %48 = llvm.insertvalue %47, %46[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %49 = llvm.insertvalue %36, %48[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %50 = llvm.insertvalue %37, %49[3, 1] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %51 = llvm.insertvalue %37, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %52 = llvm.insertvalue %38, %51[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %53 = llvm.mlir.constant(900 : index) : i64 + %54 = llvm.mlir.constant(64 : index) : i64 + %55 = llvm.mlir.constant(1 : index) : i64 + %56 = llvm.mlir.constant(57600 : index) : i64 + %57 = llvm.mlir.zero : !llvm.ptr + %58 = llvm.getelementptr %57[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %59 = llvm.ptrtoint %58 : !llvm.ptr to i64 + %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr + %61 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %64 = llvm.mlir.constant(0 : index) : i64 + %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %66 = llvm.insertvalue %53, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %67 = llvm.insertvalue %54, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %68 = llvm.insertvalue %54, %67[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %69 = llvm.insertvalue %55, %68[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %70 = llvm.mlir.constant(64 : index) : i64 + %71 = llvm.mlir.constant(1 : index) : i64 + %72 = llvm.mlir.zero : !llvm.ptr + %73 = llvm.getelementptr %72[%70] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %74 = llvm.ptrtoint %73 : !llvm.ptr to i64 + %75 = llvm.call @malloc(%74) : (i64) -> !llvm.ptr + %76 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %77 = llvm.insertvalue %75, %76[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %78 = 
llvm.insertvalue %75, %77[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %79 = llvm.mlir.constant(0 : index) : i64 + %80 = llvm.insertvalue %79, %78[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %81 = llvm.insertvalue %70, %80[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %82 = llvm.insertvalue %71, %81[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %83 = builtin.unrealized_conversion_cast %82 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<64xi32> + %84 = llvm.mlir.constant(0 : i32) : i32 + linalg.fill ins(%84 : i32) outs(%83 : memref<64xi32>) + %85 = llvm.mlir.constant(0 : index) : i64 + %86 = llvm.mlir.constant(1 : index) : i64 + %87 = llvm.mlir.constant(3 : index) : i64 + llvm.br ^bb1(%85 : i64) + ^bb1(%88: i64): // 2 preds: ^bb0, ^bb11 + %89 = llvm.icmp "slt" %88, %87 : i64 + llvm.cond_br %89, ^bb2, ^bb12 + ^bb2: // pred: ^bb1 + %90 = llvm.mlir.constant(3 : index) : i64 + llvm.br ^bb3(%85 : i64) + ^bb3(%91: i64): // 2 preds: ^bb2, ^bb10 + %92 = llvm.icmp "slt" %91, %90 : i64 + llvm.cond_br %92, ^bb4, ^bb11 + ^bb4: // pred: ^bb3 + %93 = llvm.mlir.constant(32 : index) : i64 + llvm.br ^bb5(%85 : i64) + ^bb5(%94: i64): // 2 preds: ^bb4, ^bb9 + %95 = llvm.icmp "slt" %94, %93 : i64 + llvm.cond_br %95, ^bb6, ^bb10 + ^bb6: // pred: ^bb5 + %96 = llvm.mlir.constant(64 : index) : i64 + llvm.br ^bb7(%85 : i64) + ^bb7(%97: i64): // 2 preds: ^bb6, ^bb8 + %98 = llvm.icmp "slt" %97, %96 : i64 + llvm.cond_br %98, ^bb8, ^bb9 + ^bb8: // pred: ^bb7 + %99 = llvm.mlir.constant(3 : index) : i64 + %100 = llvm.mlir.constant(32 : index) : i64 + %101 = llvm.mul %88, %99 : i64 + %102 = llvm.mul %101, %100 : i64 + %103 = llvm.mul %91, %100 : i64 + %104 = llvm.add %102, %103 : i64 + %105 = llvm.add %104, %94 : i64 + %106 = llvm.extractvalue %35[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %107 = llvm.mlir.constant(6144 : index) : i64 + %108 = 
llvm.mul %88, %107 : i64 + %109 = llvm.mlir.constant(2048 : index) : i64 + %110 = llvm.mul %91, %109 : i64 + %111 = llvm.add %108, %110 : i64 + %112 = llvm.mlir.constant(64 : index) : i64 + %113 = llvm.mul %94, %112 : i64 + %114 = llvm.add %111, %113 : i64 + %115 = llvm.add %114, %97 : i64 + %116 = llvm.getelementptr %106[%115] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %117 = llvm.load %116 : !llvm.ptr -> f16 + %118 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %119 = llvm.mlir.constant(64 : index) : i64 + %120 = llvm.mul %105, %119 : i64 + %121 = llvm.add %120, %97 : i64 + %122 = llvm.getelementptr %118[%121] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %117, %122 : f16, !llvm.ptr + %123 = llvm.add %97, %86 : i64 + llvm.br ^bb7(%123 : i64) + ^bb9: // pred: ^bb7 + %124 = llvm.add %94, %86 : i64 + llvm.br ^bb5(%124 : i64) + ^bb10: // pred: ^bb5 + %125 = llvm.add %91, %86 : i64 + llvm.br ^bb3(%125 : i64) + ^bb11: // pred: ^bb3 + %126 = llvm.add %88, %86 : i64 + llvm.br ^bb1(%126 : i64) + ^bb12: // pred: ^bb1 + %127 = llvm.mlir.constant(30 : i64) : i64 + %128 = llvm.mlir.constant(3 : i64) : i64 + %129 = llvm.extractvalue %23[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %130 = llvm.ptrtoint %129 : !llvm.ptr to i64 + %131 = llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %132 = llvm.ptrtoint %131 : !llvm.ptr to i64 + %133 = llvm.extractvalue %82[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %134 = llvm.ptrtoint %133 : !llvm.ptr to i64 + %135 = llvm.extractvalue %52[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %136 = llvm.ptrtoint %135 : !llvm.ptr to i64 + %137 = llvm.mlir.constant(64 : i64) : i64 + %138 = llvm.mlir.constant(2 : i64) : i64 + %139 = llvm.mlir.constant(4575657221408424000 : i64) : i64 + "gemmini.intr.config_st"(%138, %139) : (i64, i64) -> () + %140 = llvm.mlir.constant(65540 : i64) : i64 + %141 = 
llvm.mlir.constant(281474976710656 : i64) : i64 + "gemmini.intr.config_ex"(%140, %141) : (i64, i64) -> () + %142 = llvm.mlir.constant(0 : i64) : i64 + %143 = llvm.mlir.constant(0 : i64) : i64 + %144 = llvm.mlir.constant(0 : i64) : i64 + %145 = llvm.mlir.constant(0 : i64) : i64 + %146 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %147 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%146, %147) : (i64, i64) -> () + %148 = llvm.mlir.constant(844429225164800 : i64) : i64 + %149 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%148, %149) : (i64, i64) -> () + %150 = llvm.mlir.constant(844437817131008 : i64) : i64 + %151 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%150, %151) : (i64, i64) -> () + %152 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %153 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%152, %153) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %132) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %130) : (i64, i64) -> () + %154 = llvm.mlir.constant(256 : i64) : i64 + %155 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%154, %155) : (i64, i64) -> () + %156 = llvm.mlir.constant(16 : i64) : i64 + %157 = llvm.add %132, %156 : i64 + %158 = llvm.mlir.constant(64 : i64) : i64 + %159 = llvm.add %134, %158 : i64 + %160 = llvm.mlir.constant(16 : i64) : i64 + %161 = llvm.add %136, %160 : i64 + %162 = llvm.mlir.constant(0 : i64) : i64 + %163 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %164 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%163, %164) : (i64, i64) -> () + %165 = llvm.mlir.constant(844429225164800 : i64) : i64 + %166 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%165, %166) : (i64, i64) -> () + %167 = llvm.mlir.constant(844437817131008 : i64) : i64 + %168 = 
llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%167, %168) : (i64, i64) -> () + %169 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %170 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%169, %170) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%161, %157) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%159, %130) : (i64, i64) -> () + %171 = llvm.mlir.constant(256 : i64) : i64 + %172 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%171, %172) : (i64, i64) -> () + %173 = llvm.mlir.constant(32 : i64) : i64 + %174 = llvm.add %132, %173 : i64 + %175 = llvm.mlir.constant(128 : i64) : i64 + %176 = llvm.add %134, %175 : i64 + %177 = llvm.mlir.constant(32 : i64) : i64 + %178 = llvm.add %136, %177 : i64 + %179 = llvm.mlir.constant(0 : i64) : i64 + %180 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %181 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%180, %181) : (i64, i64) -> () + %182 = llvm.mlir.constant(844429225164800 : i64) : i64 + %183 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%182, %183) : (i64, i64) -> () + %184 = llvm.mlir.constant(844437817131008 : i64) : i64 + %185 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%184, %185) : (i64, i64) -> () + %186 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %187 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%186, %187) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%178, %174) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%176, %130) : (i64, i64) -> () + %188 = llvm.mlir.constant(256 : i64) : i64 + %189 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%188, %189) : (i64, i64) -> () + %190 = llvm.mlir.constant(48 : i64) : i64 + %191 = llvm.add %132, %190 : i64 + %192 = llvm.mlir.constant(192 : i64) : i64 + %193 = llvm.add %134, %192 : i64 + 
%194 = llvm.mlir.constant(48 : i64) : i64 + %195 = llvm.add %136, %194 : i64 + %196 = llvm.mlir.constant(0 : i64) : i64 + %197 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %198 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%197, %198) : (i64, i64) -> () + %199 = llvm.mlir.constant(844429225164800 : i64) : i64 + %200 = llvm.mlir.constant(281569467498512 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%199, %200) : (i64, i64) -> () + %201 = llvm.mlir.constant(844437817131008 : i64) : i64 + %202 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%201, %202) : (i64, i64) -> () + %203 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %204 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%203, %204) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%195, %191) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%193, %130) : (i64, i64) -> () + %205 = llvm.mlir.constant(256 : i64) : i64 + %206 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%205, %206) : (i64, i64) -> () + %207 = llvm.mlir.constant(1472 : i64) : i64 + %208 = llvm.add %132, %207 : i64 + %209 = llvm.mlir.constant(0 : i64) : i64 + %210 = llvm.mlir.constant(0 : i64) : i64 + %211 = llvm.mlir.constant(736 : i64) : i64 + %212 = llvm.add %130, %211 : i64 + %213 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %214 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%213, %214) : (i64, i64) -> () + %215 = llvm.mlir.constant(844429225164800 : i64) : i64 + %216 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%215, %216) : (i64, i64) -> () + %217 = llvm.mlir.constant(844437817131008 : i64) : i64 + %218 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%217, %218) : (i64, i64) -> () + %219 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %220 = llvm.mlir.constant(65543 : i64) : i64 + 
"gemmini.intr.loop_conv_ws_config4"(%219, %220) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %208) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %212) : (i64, i64) -> () + %221 = llvm.mlir.constant(256 : i64) : i64 + %222 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%221, %222) : (i64, i64) -> () + %223 = llvm.mlir.constant(1488 : i64) : i64 + %224 = llvm.add %132, %223 : i64 + %225 = llvm.mlir.constant(64 : i64) : i64 + %226 = llvm.add %134, %225 : i64 + %227 = llvm.mlir.constant(16 : i64) : i64 + %228 = llvm.add %136, %227 : i64 + %229 = llvm.mlir.constant(736 : i64) : i64 + %230 = llvm.add %130, %229 : i64 + %231 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %232 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%231, %232) : (i64, i64) -> () + %233 = llvm.mlir.constant(844429225164800 : i64) : i64 + %234 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%233, %234) : (i64, i64) -> () + %235 = llvm.mlir.constant(844437817131008 : i64) : i64 + %236 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%235, %236) : (i64, i64) -> () + %237 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %238 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%237, %238) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%228, %224) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%226, %230) : (i64, i64) -> () + %239 = llvm.mlir.constant(256 : i64) : i64 + %240 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%239, %240) : (i64, i64) -> () + %241 = llvm.mlir.constant(1504 : i64) : i64 + %242 = llvm.add %132, %241 : i64 + %243 = llvm.mlir.constant(128 : i64) : i64 + %244 = llvm.add %134, %243 : i64 + %245 = llvm.mlir.constant(32 : i64) : i64 + %246 = llvm.add %136, %245 : i64 + %247 = llvm.mlir.constant(736 : i64) : i64 + %248 = llvm.add %130, %247 : i64 + %249 = 
llvm.mlir.constant(18014535950532609 : i64) : i64 + %250 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%249, %250) : (i64, i64) -> () + %251 = llvm.mlir.constant(844429225164800 : i64) : i64 + %252 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%251, %252) : (i64, i64) -> () + %253 = llvm.mlir.constant(844437817131008 : i64) : i64 + %254 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%253, %254) : (i64, i64) -> () + %255 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %256 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%255, %256) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%246, %242) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%244, %248) : (i64, i64) -> () + %257 = llvm.mlir.constant(256 : i64) : i64 + %258 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%257, %258) : (i64, i64) -> () + %259 = llvm.mlir.constant(1520 : i64) : i64 + %260 = llvm.add %132, %259 : i64 + %261 = llvm.mlir.constant(192 : i64) : i64 + %262 = llvm.add %134, %261 : i64 + %263 = llvm.mlir.constant(48 : i64) : i64 + %264 = llvm.add %136, %263 : i64 + %265 = llvm.mlir.constant(736 : i64) : i64 + %266 = llvm.add %130, %265 : i64 + %267 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %268 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%267, %268) : (i64, i64) -> () + %269 = llvm.mlir.constant(844429225164800 : i64) : i64 + %270 = llvm.mlir.constant(281569466449936 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%269, %270) : (i64, i64) -> () + %271 = llvm.mlir.constant(844437817131008 : i64) : i64 + %272 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%271, %272) : (i64, i64) -> () + %273 = llvm.mlir.constant(6192449487634432 : i64) : i64 + %274 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%273, %274) : (i64, 
i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%264, %260) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%262, %266) : (i64, i64) -> () + %275 = llvm.mlir.constant(256 : i64) : i64 + %276 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%275, %276) : (i64, i64) -> () + %277 = llvm.mlir.constant(42240 : i64) : i64 + %278 = llvm.add %132, %277 : i64 + %279 = llvm.mlir.constant(0 : i64) : i64 + %280 = llvm.mlir.constant(0 : i64) : i64 + %281 = llvm.mlir.constant(22528 : i64) : i64 + %282 = llvm.add %130, %281 : i64 + %283 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %284 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%283, %284) : (i64, i64) -> () + %285 = llvm.mlir.constant(844429225164800 : i64) : i64 + %286 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%285, %286) : (i64, i64) -> () + %287 = llvm.mlir.constant(844437817131008 : i64) : i64 + %288 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%287, %288) : (i64, i64) -> () + %289 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %290 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%289, %290) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %278) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %282) : (i64, i64) -> () + %291 = llvm.mlir.constant(256 : i64) : i64 + %292 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%291, %292) : (i64, i64) -> () + %293 = llvm.mlir.constant(42256 : i64) : i64 + %294 = llvm.add %132, %293 : i64 + %295 = llvm.mlir.constant(64 : i64) : i64 + %296 = llvm.add %134, %295 : i64 + %297 = llvm.mlir.constant(16 : i64) : i64 + %298 = llvm.add %136, %297 : i64 + %299 = llvm.mlir.constant(22528 : i64) : i64 + %300 = llvm.add %130, %299 : i64 + %301 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %302 = llvm.mlir.constant(4296933406 : i64) : i64 + 
"gemmini.intr.loop_conv_ws_config1"(%301, %302) : (i64, i64) -> () + %303 = llvm.mlir.constant(844429225164800 : i64) : i64 + %304 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%303, %304) : (i64, i64) -> () + %305 = llvm.mlir.constant(844437817131008 : i64) : i64 + %306 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%305, %306) : (i64, i64) -> () + %307 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %308 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%307, %308) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%298, %294) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%296, %300) : (i64, i64) -> () + %309 = llvm.mlir.constant(256 : i64) : i64 + %310 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%309, %310) : (i64, i64) -> () + %311 = llvm.mlir.constant(42272 : i64) : i64 + %312 = llvm.add %132, %311 : i64 + %313 = llvm.mlir.constant(128 : i64) : i64 + %314 = llvm.add %134, %313 : i64 + %315 = llvm.mlir.constant(32 : i64) : i64 + %316 = llvm.add %136, %315 : i64 + %317 = llvm.mlir.constant(22528 : i64) : i64 + %318 = llvm.add %130, %317 : i64 + %319 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %320 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%319, %320) : (i64, i64) -> () + %321 = llvm.mlir.constant(844429225164800 : i64) : i64 + %322 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%321, %322) : (i64, i64) -> () + %323 = llvm.mlir.constant(844437817131008 : i64) : i64 + %324 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%323, %324) : (i64, i64) -> () + %325 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %326 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%325, %326) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%316, %312) : (i64, i64) -> () + 
"gemmini.intr.loop_conv_ws_config6"(%314, %318) : (i64, i64) -> () + %327 = llvm.mlir.constant(256 : i64) : i64 + %328 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%327, %328) : (i64, i64) -> () + %329 = llvm.mlir.constant(42288 : i64) : i64 + %330 = llvm.add %132, %329 : i64 + %331 = llvm.mlir.constant(192 : i64) : i64 + %332 = llvm.add %134, %331 : i64 + %333 = llvm.mlir.constant(48 : i64) : i64 + %334 = llvm.add %136, %333 : i64 + %335 = llvm.mlir.constant(22528 : i64) : i64 + %336 = llvm.add %130, %335 : i64 + %337 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %338 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%337, %338) : (i64, i64) -> () + %339 = llvm.mlir.constant(844429225164800 : i64) : i64 + %340 = llvm.mlir.constant(281509337956368 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%339, %340) : (i64, i64) -> () + %341 = llvm.mlir.constant(844437817131008 : i64) : i64 + %342 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%341, %342) : (i64, i64) -> () + %343 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %344 = llvm.mlir.constant(65559 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%343, %344) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%334, %330) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%332, %336) : (i64, i64) -> () + %345 = llvm.mlir.constant(256 : i64) : i64 + %346 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%345, %346) : (i64, i64) -> () + %347 = llvm.mlir.constant(43712 : i64) : i64 + %348 = llvm.add %132, %347 : i64 + %349 = llvm.mlir.constant(0 : i64) : i64 + %350 = llvm.mlir.constant(0 : i64) : i64 + %351 = llvm.mlir.constant(23264 : i64) : i64 + %352 = llvm.add %130, %351 : i64 + %353 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %354 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%353, %354) : (i64, i64) -> () + %355 = 
llvm.mlir.constant(844429225164800 : i64) : i64 + %356 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%355, %356) : (i64, i64) -> () + %357 = llvm.mlir.constant(844437817131008 : i64) : i64 + %358 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%357, %358) : (i64, i64) -> () + %359 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %360 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%359, %360) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%136, %348) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%134, %352) : (i64, i64) -> () + %361 = llvm.mlir.constant(256 : i64) : i64 + %362 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%361, %362) : (i64, i64) -> () + %363 = llvm.mlir.constant(43728 : i64) : i64 + %364 = llvm.add %132, %363 : i64 + %365 = llvm.mlir.constant(64 : i64) : i64 + %366 = llvm.add %134, %365 : i64 + %367 = llvm.mlir.constant(16 : i64) : i64 + %368 = llvm.add %136, %367 : i64 + %369 = llvm.mlir.constant(23264 : i64) : i64 + %370 = llvm.add %130, %369 : i64 + %371 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %372 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%371, %372) : (i64, i64) -> () + %373 = llvm.mlir.constant(844429225164800 : i64) : i64 + %374 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%373, %374) : (i64, i64) -> () + %375 = llvm.mlir.constant(844437817131008 : i64) : i64 + %376 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%375, %376) : (i64, i64) -> () + %377 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %378 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%377, %378) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%368, %364) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%366, %370) : (i64, i64) -> () + %379 = 
llvm.mlir.constant(256 : i64) : i64 + %380 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%379, %380) : (i64, i64) -> () + %381 = llvm.mlir.constant(43744 : i64) : i64 + %382 = llvm.add %132, %381 : i64 + %383 = llvm.mlir.constant(128 : i64) : i64 + %384 = llvm.add %134, %383 : i64 + %385 = llvm.mlir.constant(32 : i64) : i64 + %386 = llvm.add %136, %385 : i64 + %387 = llvm.mlir.constant(23264 : i64) : i64 + %388 = llvm.add %130, %387 : i64 + %389 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %390 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%389, %390) : (i64, i64) -> () + %391 = llvm.mlir.constant(844429225164800 : i64) : i64 + %392 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%391, %392) : (i64, i64) -> () + %393 = llvm.mlir.constant(844437817131008 : i64) : i64 + %394 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%393, %394) : (i64, i64) -> () + %395 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %396 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%395, %396) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%386, %382) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%384, %388) : (i64, i64) -> () + %397 = llvm.mlir.constant(256 : i64) : i64 + %398 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%397, %398) : (i64, i64) -> () + %399 = llvm.mlir.constant(43760 : i64) : i64 + %400 = llvm.add %132, %399 : i64 + %401 = llvm.mlir.constant(192 : i64) : i64 + %402 = llvm.add %134, %401 : i64 + %403 = llvm.mlir.constant(48 : i64) : i64 + %404 = llvm.add %136, %403 : i64 + %405 = llvm.mlir.constant(23264 : i64) : i64 + %406 = llvm.add %130, %405 : i64 + %407 = llvm.mlir.constant(18014535950532609 : i64) : i64 + %408 = llvm.mlir.constant(4296933406 : i64) : i64 + "gemmini.intr.loop_conv_ws_config1"(%407, %408) : (i64, i64) -> () + %409 = 
llvm.mlir.constant(844429225164800 : i64) : i64 + %410 = llvm.mlir.constant(281509336907792 : i64) : i64 + "gemmini.intr.loop_conv_ws_config2"(%409, %410) : (i64, i64) -> () + %411 = llvm.mlir.constant(844437817131008 : i64) : i64 + %412 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.loop_conv_ws_config3"(%411, %412) : (i64, i64) -> () + %413 = llvm.mlir.constant(2251799813685248 : i64) : i64 + %414 = llvm.mlir.constant(65543 : i64) : i64 + "gemmini.intr.loop_conv_ws_config4"(%413, %414) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config5"(%404, %400) : (i64, i64) -> () + "gemmini.intr.loop_conv_ws_config6"(%402, %406) : (i64, i64) -> () + %415 = llvm.mlir.constant(256 : i64) : i64 + %416 = llvm.mlir.constant(1 : i64) : i64 + "gemmini.intr.loop_conv_ws"(%415, %416) : (i64, i64) -> () + %417 = llvm.mlir.constant(0 : i64) : i64 + "gemmini.intr.flush"(%417, %417) : (i64, i64) -> () + %418 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb13(%85 : i64) + ^bb13(%419: i64): // 2 preds: ^bb12, ^bb23 + %420 = llvm.icmp "slt" %419, %418 : i64 + llvm.cond_br %420, ^bb14, ^bb24 + ^bb14: // pred: ^bb13 + %421 = llvm.mlir.constant(30 : index) : i64 + llvm.br ^bb15(%85 : i64) + ^bb15(%422: i64): // 2 preds: ^bb14, ^bb22 + %423 = llvm.icmp "slt" %422, %421 : i64 + llvm.cond_br %423, ^bb16, ^bb23 + ^bb16: // pred: ^bb15 + %424 = llvm.mlir.constant(30 : index) : i64 + llvm.br ^bb17(%85 : i64) + ^bb17(%425: i64): // 2 preds: ^bb16, ^bb21 + %426 = llvm.icmp "slt" %425, %424 : i64 + llvm.cond_br %426, ^bb18, ^bb22 + ^bb18: // pred: ^bb17 + %427 = llvm.mlir.constant(64 : index) : i64 + llvm.br ^bb19(%85 : i64) + ^bb19(%428: i64): // 2 preds: ^bb18, ^bb20 + %429 = llvm.icmp "slt" %428, %427 : i64 + llvm.cond_br %429, ^bb20, ^bb21 + ^bb20: // pred: ^bb19 + %430 = llvm.mlir.constant(30 : index) : i64 + %431 = llvm.mul %419, %430 : i64 + %432 = llvm.mul %431, %430 : i64 + %433 = llvm.mul %422, %430 : i64 + %434 = llvm.add %432, %433 : i64 + %435 = llvm.add %434, %425 : i64 + %436 
= llvm.extractvalue %69[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %437 = llvm.mlir.constant(64 : index) : i64 + %438 = llvm.mul %435, %437 : i64 + %439 = llvm.add %438, %428 : i64 + %440 = llvm.getelementptr %436[%439] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %441 = llvm.load %440 : !llvm.ptr -> f32 + %442 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %443 = llvm.mlir.constant(57600 : index) : i64 + %444 = llvm.mul %419, %443 : i64 + %445 = llvm.mlir.constant(1920 : index) : i64 + %446 = llvm.mul %422, %445 : i64 + %447 = llvm.add %444, %446 : i64 + %448 = llvm.mlir.constant(64 : index) : i64 + %449 = llvm.mul %425, %448 : i64 + %450 = llvm.add %447, %449 : i64 + %451 = llvm.add %450, %428 : i64 + %452 = llvm.getelementptr %442[%451] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %441, %452 : f32, !llvm.ptr + %453 = llvm.add %428, %86 : i64 + llvm.br ^bb19(%453 : i64) + ^bb21: // pred: ^bb19 + %454 = llvm.add %425, %86 : i64 + llvm.br ^bb17(%454 : i64) + ^bb22: // pred: ^bb17 + %455 = llvm.add %422, %86 : i64 + llvm.br ^bb15(%455 : i64) + ^bb23: // pred: ^bb15 + %456 = llvm.add %419, %86 : i64 + llvm.br ^bb13(%456 : i64) + ^bb24: // pred: ^bb13 + %457 = llvm.extractvalue %52[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%457) : (!llvm.ptr) -> () + %458 = llvm.extractvalue %69[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @free(%458) : (!llvm.ptr) -> () + %459 = llvm.extractvalue %82[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + llvm.call @free(%459) : (!llvm.ptr) -> () + llvm.return + } +} + +