func.func @main(%arg0: !disc_ral.context) attributes {tf.entry_function = {input_placements = "gpu", inputs = "input.1_", output_placements = "gpu", outputs = "8"}} {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %false = arith.constant false
  %true = arith.constant true
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %c10 = arith.constant 10 : index
  %c0 = arith.constant 0 : index
  %1 = "disc_ral.dispatch"(%arg0, %c0) {backend_config = "", call_target_name = "ral_recv_input", device = "cpu", has_side_effect = false} : (!disc_ral.context, index) -> memref<?x10xf32, #gpu.address_space<global>>
  %dim = memref.dim %1, %c0 : memref<?x10xf32, #gpu.address_space<global>>
  %alloc = memref.alloc() : memref<10x10xf32, #gpu.address_space<global>>
  "lmhlo.constant"(%alloc) {disc.device = "gpu", value = dense_resource<__elided__> : tensor<10x10xf32>} : (memref<10x10xf32, #gpu.address_space<global>>) -> ()
  %alloc_0 = memref.alloc() : memref<10x10xf32, #gpu.address_space<global>>
  "lmhlo.constant"(%alloc_0) {disc.device = "gpu", value = dense_resource<__elided__> : tensor<10x10xf32>} : (memref<10x10xf32, #gpu.address_space<global>>) -> ()
  %alloc_1 = memref.alloc() : memref<10xf32, #gpu.address_space<global>>
  "lmhlo.constant"(%alloc_1) {disc.device = "gpu", value = dense<[0.186997086, 0.235856801, 0.217500299, 0.25940907, 0.109970599, -0.152944937, 0.137896746, -0.189537019, 0.256005555, 0.235299528]> : tensor<10xf32>} : (memref<10xf32, #gpu.address_space<global>>) -> ()
  %alloc_2 = memref.alloc() : memref<10xf32, #gpu.address_space<global>>
  "lmhlo.constant"(%alloc_2) {disc.device = "gpu", value = dense<[0.281509697, -0.0671350583, -0.291665494, 0.300998032, -0.304899603, 0.23629041, -0.111676671, 0.304613203, 0.107744612, -0.118951075]> : tensor<10xf32>} : (memref<10xf32, #gpu.address_space<global>>) -> ()
  %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [%dim, 10], strides: [10, 1] {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>> to memref<?x10xf32, #gpu.address_space<global>>
  %alloc_3 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %2 = llvm.inttoptr %0 : i32 to !llvm.ptr<i8>
  "disc_ral.dispatch"(%arg0, %2, %reinterpret_cast, %alloc_0, %alloc_3, %false, %false, %true) {backend_config = "", call_target_name = "ral_gemm", device = "gpu", has_side_effect = false} : (!disc_ral.context, !llvm.ptr<i8>, memref<?x10xf32, #gpu.address_space<global>>, memref<10x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, i1, i1, i1) -> ()
  memref.dealloc %alloc_0 : memref<10x10xf32, #gpu.address_space<global>>
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<2xindex>
  memref.store %dim, %alloca[%c0] : memref<2xindex>
  memref.store %c10, %alloca[%c1] : memref<2xindex>
  %3 = arith.muli %dim, %c10 : index
  %4 = arith.remui %3, %c4 : index
  %5 = arith.cmpi eq, %4, %c0 : index
  %alloc_4 = memref.alloc() : memref<f32, #gpu.address_space<global>>
  %alloc_5 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %alloc_6 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %alloc_7 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %alloc_8 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  scf.if %5 {
    "lmhlo.fusion"() ({
      "lmhlo.constant"(%alloc_4) {disc.device = "gpu", value = dense<0.000000e+00> : tensor<f32>} : (memref<f32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_1, %alloca, %alloc_5) {broadcast_dimensions = dense<1> : tensor<1xi64>, disc.device = "gpu"} : (memref<10xf32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.add"(%alloc_3, %alloc_5, %alloc_6) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_4, %alloca, %alloc_8) {broadcast_dimensions = dense<> : tensor<0xi64>, disc.device = "gpu"} : (memref<f32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.maximum"(%alloc_6, %alloc_8, %alloc_7) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.terminator"() : () -> ()
    }) {disc.device = "gpu", disc.fusion.name = "main_kLoop_maximum__5_1_0", disc.fusion.tag = "Vec4", disc.fusion_type = "kLoop", disc_vectorize_or_tile_hint = 4 : i32} : () -> ()
  } else {
    "lmhlo.fusion"() ({
      "lmhlo.constant"(%alloc_4) {disc.device = "gpu", value = dense<0.000000e+00> : tensor<f32>} : (memref<f32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_1, %alloca, %alloc_5) {broadcast_dimensions = dense<1> : tensor<1xi64>, disc.device = "gpu"} : (memref<10xf32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.add"(%alloc_3, %alloc_5, %alloc_6) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_4, %alloca, %alloc_8) {broadcast_dimensions = dense<> : tensor<0xi64>, disc.device = "gpu"} : (memref<f32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.maximum"(%alloc_6, %alloc_8, %alloc_7) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.terminator"() : () -> ()
    }) {disc.device = "gpu", disc.fusion.name = "main_kLoop_maximum__5_1_0", disc.fusion_type = "kLoop", disc_vectorize_or_tile_hint = 1 : i32} : () -> ()
  }
  memref.dealloc %alloc_8 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_6 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_5 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_4 : memref<f32, #gpu.address_space<global>>
  memref.dealloc %alloc_3 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_1 : memref<10xf32, #gpu.address_space<global>>
  %alloc_9 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %6 = llvm.inttoptr %0 : i32 to !llvm.ptr<i8>
  "disc_ral.dispatch"(%arg0, %6, %alloc_7, %alloc, %alloc_9, %false, %false, %true) {backend_config = "", call_target_name = "ral_gemm", device = "gpu", has_side_effect = false} : (!disc_ral.context, !llvm.ptr<i8>, memref<?x10xf32, #gpu.address_space<global>>, memref<10x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, i1, i1, i1) -> ()
  memref.dealloc %alloc_7 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc : memref<10x10xf32, #gpu.address_space<global>>
  %alloc_10 = memref.alloc() : memref<f32, #gpu.address_space<global>>
  %alloc_11 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %alloc_12 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %alloc_13 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  %alloc_14 = memref.alloc(%dim) {kDiscSymbolicDimAttr = [@S0, @C10]} : memref<?x10xf32, #gpu.address_space<global>>
  scf.if %5 {
    "lmhlo.fusion"() ({
      "lmhlo.constant"(%alloc_10) {value = dense<0.000000e+00> : tensor<f32>} : (memref<f32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_10, %alloca, %alloc_11) {broadcast_dimensions = dense<> : tensor<0xi64>, disc.device = "gpu"} : (memref<f32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_2, %alloca, %alloc_12) {broadcast_dimensions = dense<1> : tensor<1xi64>, disc.device = "gpu"} : (memref<10xf32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.add"(%alloc_9, %alloc_12, %alloc_13) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.maximum"(%alloc_13, %alloc_11, %alloc_14) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.terminator"() : () -> ()
    }) {disc.device = "gpu", disc.fusion.name = "main_kLoop_maximum__5_1_1", disc.fusion.tag = "Vec4", disc.fusion_type = "kLoop", disc_vectorize_or_tile_hint = 4 : i32} : () -> ()
  } else {
    "lmhlo.fusion"() ({
      "lmhlo.constant"(%alloc_10) {value = dense<0.000000e+00> : tensor<f32>} : (memref<f32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_10, %alloca, %alloc_11) {broadcast_dimensions = dense<> : tensor<0xi64>, disc.device = "gpu"} : (memref<f32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.dynamic_broadcast_in_dim"(%alloc_2, %alloca, %alloc_12) {broadcast_dimensions = dense<1> : tensor<1xi64>, disc.device = "gpu"} : (memref<10xf32, #gpu.address_space<global>>, memref<2xindex>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.add"(%alloc_9, %alloc_12, %alloc_13) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.maximum"(%alloc_13, %alloc_11, %alloc_14) {disc.device = "gpu"} : (memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>, memref<?x10xf32, #gpu.address_space<global>>) -> ()
      "lmhlo.terminator"() : () -> ()
    }) {disc.device = "gpu", disc.fusion.name = "main_kLoop_maximum__5_1_1", disc.fusion_type = "kLoop", disc_vectorize_or_tile_hint = 1 : i32} : () -> ()
  }
  memref.dealloc %alloc_13 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_12 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_11 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_10 : memref<f32, #gpu.address_space<global>>
  memref.dealloc %alloc_9 : memref<?x10xf32, #gpu.address_space<global>>
  memref.dealloc %alloc_2 : memref<10xf32, #gpu.address_space<global>>
  "disc_ral.dispatch"(%arg0, %c0, %alloc_14) {backend_config = "", call_target_name = "ral_send_output", device = "cpu", has_side_effect = false} : (!disc_ral.context, index, memref<?x10xf32, #gpu.address_space<global>>) -> ()
  return
}