{"meta":{"title":"Don't Respond","subtitle":null,"description":null,"author":"Dou Jiang","url":"https://hjchen2.github.io","root":"/"},"pages":[{"title":"","date":"2017-03-23T03:45:32.000Z","updated":"2023-01-03T12:30:34.861Z","comments":false,"path":"index.html","permalink":"https://hjchen2.github.io/index.html","excerpt":"","text":""},{"title":"","date":"2017-03-23T03:45:32.000Z","updated":"2023-01-03T12:30:34.862Z","comments":false,"path":"categories/index.html","permalink":"https://hjchen2.github.io/categories/index.html","excerpt":"","text":""},{"title":"标签","date":"2017-03-23T03:53:23.000Z","updated":"2023-01-03T12:30:34.861Z","comments":false,"path":"tags/index.html","permalink":"https://hjchen2.github.io/tags/index.html","excerpt":"","text":""},{"title":"关于","date":"2017-03-24T07:07:18.000Z","updated":"2023-01-03T12:30:34.860Z","comments":false,"path":"about/index.html","permalink":"https://hjchen2.github.io/about/index.html","excerpt":"","text":""}],"posts":[{"title":"IREE编译流程解析(六)","slug":"IREE编译流程6","date":"2023-02-24T10:47:11.000Z","updated":"2023-03-07T06:09:20.211Z","comments":true,"path":"2023/02/24/IREE编译流程6/","link":"","permalink":"https://hjchen2.github.io/2023/02/24/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B6/","excerpt":"HAL::HALTransformPassPipeline的主要作用是进行tiling、vectorization和bufferization等操作,分配计算负载,最终生成target device的代码。比如cuda target的dispatch source code会被递降为NVVM IR。","text":"HAL::HALTransformPassPipeline的主要作用是进行tiling、vectorization和bufferization等操作,分配计算负载,最终生成target device的代码。比如cuda target的dispatch source code会被递降为NVVM IR。 buildHALConfigurationPassPipeline addCleanupPatterns createAssignTargetDevicesPass 在最外层的module上添加device targets属性,可以指定多个target devices。 123module attributes {hal.device.targets = [#hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_35"}>], legacy_sync}>]} { ...} createVerifyTargetEnvironmentPass 验证device tagets是否正确设置,以及编译后端是否被注册过。 createMaterializeInterfacesPass 为每个executable创建device target相关的变体(variant),每一种device target对应一个executable variant。将executable的export和source func都转换为无参数的func,统一dispatch、export和source func的调用接口,dispatch指定输入和bindings的关系,source func则通过binding id来获取输入参数。 12345678910111213141516171819202122232425stream.executable private @test_dispatch_0 { stream.executable.export public @test_dispatch_0_generic_100000x100 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @test_dispatch_0_generic_100000x100(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { ... return } }}func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { ... 
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c40000000}, %1 as %arg3: !stream.resource<external>{%c40000000}, %2 as %arg4: !stream.resource<external>{%c400000}) { stream.cmd.fill %c0_i8, %arg4[%c0 for %c400000] : i8 -> !stream.resource<external>{%c400000} stream.cmd.dispatch @test_dispatch_0::@test_dispatch_0_generic_100000x100[%c100000, %c1] { ro %arg2[%c0 for %c40000000] : !stream.resource<external>{%c40000000}, ro %arg3[%c0 for %c40000000] : !stream.resource<external>{%c40000000}, rw %arg4[%c0 for %c400000] : !stream.resource<external>{%c400000} } } => !stream.timepoint ...} 转换为 1234567891011121314151617181920212223242526272829303132hal.executable private @test_dispatch_0 { hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", {target_arch = "sm_35"}> { hal.executable.export public @test_dispatch_0_generic_100000x100 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @test_dispatch_0_generic_100000x100() { %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> ... return } } }}func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { ... 
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c40000000}, %1 as %arg3: !stream.resource<external>{%c40000000}, %2 as %arg4: !stream.resource<external>{%c400000}) { stream.cmd.fill %c0_i8, %arg4[%c0 for %c400000] : i8 -> !stream.resource<external>{%c400000} stream.cmd.dispatch @test_dispatch_0::@test_dispatch_0_generic_100000x100[%c100000, %c1] { ro %arg2[%c0 for %c40000000] : !stream.resource<external>{%c40000000}, ro %arg3[%c0 for %c40000000] : !stream.resource<external>{%c40000000}, rw %arg4[%c0 for %c400000] : !stream.resource<external>{%c400000} } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} } => !stream.timepoint ...} createTranslateExecutablesPass 根据每一个hal.executable.variant 的target device调用对应的后端进行编译。比如cuda会调用CUDATargetBackend,CUDATargetBackend实际执行的是下面一序列passes。 buildLLVMGPUTransformPassPipeline createTypePropagationPass 对integer的element type进行标准化,并传播修改过的type。 createBufferizeCopyOnlyDispatchesPass 将纯数据拷贝的dispatch(只有tensor load和store)转换成linalg generic op,并bufferize化。 12345678910func.func @test_dispatch_0() { %c0 = arith.constant 0 : index %0 = hal.interface.constant.load[0] : i32 %1 = arith.index_castui %0 : i32 to index %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1} %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%1} %4 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [%1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1} -> tensor<?xf32> flow.dispatch.tensor.store %4, %3, offsets = [0], sizes = [%1], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%1} return} 转换成 1234567891011121314func.func @test_dispatch_0() { %c0 = arith.constant 0 : index %0 = hal.interface.constant.load[0] : i32 %1 = arith.index_castui %0 : i32 to index %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%1} memref.assume_alignment %2, 64 : memref<?xf32, #hal.descriptor_type<storage_buffer>> %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%1} memref.assume_alignment %3, 64 : memref<?xf32, #hal.descriptor_type<storage_buffer>> linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%2 : memref<?xf32, #hal.descriptor_type<storage_buffer>>) outs(%3 : memref<?xf32, #hal.descriptor_type<storage_buffer>>) { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return} createEraseHALDescriptorTypeFromMemRefPass 将memory space为hal descriptor type的value转换成memref。 createLLVMGPULowerExecutableTargetPass initGPULaunchConfig 根据具体的计算负载和类型,计算gpu launch的配置,包括分块策略、group count、thread num以及后续lowering分发的流程等。 12345678910111213141516171819hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", {target_arch = "sm_35"}> { hal.executable.export public @test_dispatch_0_generic_100000x100 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 
hal.return %x, %y, %z : index, index, index } builtin.module { func.func @test_dispatch_0_generic_100000x100() { ... %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%3, %4 : tensor<100000x100xf32>, tensor<100000x100xf32>) outs(%5 : tensor<100000xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %7 = arith.addf %in, %in_0 : f32 %8 = arith.addf %7, %out : f32 linalg.yield %8 : f32 } -> tensor<100000xf32> ... } }} 转换成 12345678910111213141516171819hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", {target_arch = "sm_35"}> { hal.executable.export public @test_dispatch_0_generic_100000x100 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>, workgroup_size = [64 : index, 1 : index, 1 : index]} { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @test_dispatch_0_generic_100000x100() { ... %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%3, %4 : tensor<100000x100xf32>, tensor<100000x100xf32>) outs(%5 : tensor<100000xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %7 = arith.addf %in, %in_0 : f32 %8 = arith.addf %7, %out : f32 linalg.yield %8 : f32 } -> tensor<100000xf32> ... 
} }} 可以看到export func多了translation_info和workgroup_size两个属性,而source func也多了一个lowering_config属性。translation_info表示后续lowering分发到LLVMGPUVectorize这个pipeline。workgroup_size可以认为是3维的gpu block dim,这里表示每个线程块有64个线程。lowering_config指明了每层循环的分块策略,这里表示一个线程块计算256个100xf32的数据,而且每个线程一次计算一个4xf32的向量。 DispatchLoweringPassPipeline 根据translation_info分发到下面的pipeline继续lowering。 GPUSimpleDistributePassPipeline GPUVectorizationPassPipeline getTileAndDistributeConfig 定位到dispatch的root节点(一般是最后一个linalg reduction op,如果没有reduction op,则会选择最后一个linalg generic op),从节点属性中取出lowering_config(tile size),将非parallel loop对应的tile size置0,表示接下来只会对parallel loop进行vectorize,并计算parallel loop的loop range。 LowerDispatchWorkgroupCountForDagRootOp 根据loop range和tile size计算workgroup count。 12345hal.executable.export public @test_dispatch_0_generic_100000x100 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>, workgroup_size = [64 : index, 1 : index, 1 : index]} {^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index} 转换成 123456hal.executable.export public @test_dispatch_0_generic_100000x100 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>, workgroup_size = [64 : index, 1 : index, 1 : index]} {^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %c391 = arith.constant 391 : index %c1 = arith.constant 1 : index hal.return %c391, %c1, %c1 : index, index, index} 可以看到计算的group count为(391, 1, 1)。391 = UDIV(100000, 256)。 populateTileAndDistributeToWorkgroupsPatterns 对parallel loop进行分块,将source func转换成单个work group的计算逻辑。 12345678910func.func @test_dispatch_0_generic_100000x100() { ... %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%3, %4 : tensor<100000x100xf32>, tensor<100000x100xf32>) outs(%5 : tensor<100000xf32>) attrs = {__internal_linalg_transform__ = "__workgroup_tiling__", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %7 = arith.addf %in, %in_0 : f32 %8 = arith.addf %7, %out : f32 linalg.yield %8 : f32 } -> tensor<100000xf32> ...} 转换成 12345678910111213141516171819func.func @test_dispatch_0_generic_100000x100() { ... 
%workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %4 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x] scf.for %arg0 = %3 to %c100000 step %4 { %5 = affine.min affine_map<(d0) -> (256, -d0 + 100000)>(%arg0) %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%5, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %7 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [%5, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %8 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<?xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%6, %7 : tensor<?x100xf32>, tensor<?x100xf32>) outs(%8 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %10 = arith.addf %in, %in_0 : f32 %11 = arith.addf %10, %out : f32 linalg.yield %11 : f32 } -> tensor<?xf32> ...} createWorkgroupSpecializationPass 将分块之后的计算逻辑分成固定形状和剩余部分动态形状两部分计算逻辑。 12345678910111213141516171819func.func @test_dispatch_0_generic_100000x100() { ... %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %4 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x] scf.for %arg0 = %3 to %c100000 step %4 { %5 = affine.min affine_map<(d0) -> (256, -d0 + 100000)>(%arg0) %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%5, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %7 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [%5, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %8 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<?xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%6, %7 : tensor<?x100xf32>, tensor<?x100xf32>) outs(%8 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %10 = arith.addf %in, %in_0 : f32 %11 = arith.addf %10, %out : f32 linalg.yield %11 : f32 } -> tensor<?xf32> ...} 会转换成 1234567891011121314151617181920212223242526272829303132333435363738func.func @test_dispatch_0_generic_100000x100() { ... 
%workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %4 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x] scf.for %arg0 = %3 to %c100000 step %4 { %5 = affine.min affine_map<(d0) -> (-d0 + 100000, 256)>(%arg0) %c256 = arith.constant 256 : index %6 = arith.cmpi eq, %5, %c256 : index scf.if %6 { // 计算[256,100]静态形状的分块 %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c256, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [%c256, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %9 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [%c256], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<?xf32> %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%7, %8 : tensor<?x100xf32>, tensor<?x100xf32>) outs(%9 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %11 = arith.addf %in, %in_0 : f32 %12 = arith.addf %11, %out : f32 linalg.yield %12 : f32 } -> tensor<?xf32> flow.dispatch.tensor.store %10, %2, offsets = [%arg0], sizes = [%c256], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<100000xf32>> } else { // 计算剩下的[%5, 100]动态形状的分块 %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%5, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [%5, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %9 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<?xf32> %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%7, %8 : tensor<?x100xf32>, tensor<?x100xf32>) outs(%9 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %11 = arith.addf %in, %in_0 : f32 %12 = arith.addf %11, %out : f32 linalg.yield %12 : f32 } -> tensor<?xf32> flow.dispatch.tensor.store %10, %2, offsets = [%arg0], sizes = [%5], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<100000xf32>> } } return} createRemoveSingleIterationLoopPass 移除确信只会循环1次的loop。比如上面的scf.for %arg0 = %3 to %c100000 step %4就只会被循环一次,因为step = 256 * 391 = 100096 > 100000,因此这个循环会被消除,转换成如下代码。 1234567891011121314151617181920212223242526272829303132func.func @test_dispatch_0_generic_100000x100() { ... 
%c256 = arith.constant 256 : index %workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %4 = affine.min affine_map<(d0) -> (-d0 + 100000, 256)>(%3) %5 = arith.cmpi eq, %4, %c256 : index scf.if %5 { %6 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [256, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<256x100xf32> %7 = flow.dispatch.tensor.load %1, offsets = [%3, 0], sizes = [256, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<256x100xf32> %8 = flow.dispatch.tensor.load %2, offsets = [%3], sizes = [256], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<256xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%6, %7 : tensor<256x100xf32>, tensor<256x100xf32>) outs(%8 : tensor<256xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %10 = arith.addf %in, %in_0 : f32 %11 = arith.addf %10, %out : f32 linalg.yield %11 : f32 } -> tensor<256xf32> flow.dispatch.tensor.store %9, %2, offsets = [%3], sizes = [256], strides = [1] : tensor<256xf32> -> !flow.dispatch.tensor<readwrite:tensor<100000xf32>> } else { %6 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [%4, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %7 = flow.dispatch.tensor.load %1, offsets = [%3, 0], sizes = [%4, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %8 = flow.dispatch.tensor.load %2, offsets = [%3], sizes = [%4], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<?xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%6, %7 : tensor<?x100xf32>, tensor<?x100xf32>) outs(%8 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_0: f32, %out: f32): %10 = arith.addf %in, %in_0 : f32 %11 = arith.addf %10, %out : f32 linalg.yield %11 : f32 } -> tensor<?xf32> flow.dispatch.tensor.store %9, %2, offsets = [%3], sizes = [%4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<100000xf32>> } return} createLLVMGPUTileTensor 前面pass主要针对的是外层parallel loop的vectorize,生成的是一个线程块的计算逻辑,接下来继续将负载分布到每一个线程,并且对内层的reduction也做vectorize。上面的代码继续转换成如下代码, 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879func.func @test_dispatch_0_generic_100000x100() { %c100 = arith.constant 100 : index %c4 = arith.constant 4 : index %c64 = arith.constant 64 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index ... 
%workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %4 = affine.min affine_map<()[s0] -> (s0 * -256 + 100000, 256)>()[%workgroup_id_x] %5 = arith.cmpi eq, %4, %c256 : index scf.if %5 { %6 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [256, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<256x100xf32> %7 = flow.dispatch.tensor.load %1, offsets = [%3, 0], sizes = [256, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<256x100xf32> %8 = flow.dispatch.tensor.load %2, offsets = [%3], sizes = [256], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<256xf32> // 64个线程并发计算,每个线程计算[4, 100]的分块 %9 = scf.foreach_thread (%arg0) in (%c64) shared_outs(%arg1 = %8) -> (tensor<256xf32>) { %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0) %11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0) %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0) %extracted_slice = tensor.extract_slice %6[%10, 0] [4, 100] [1, 1] : tensor<256x100xf32> to tensor<4x100xf32> %extracted_slice_0 = tensor.extract_slice %7[%11, 0] [4, 100] [1, 1] : tensor<256x100xf32> to tensor<4x100xf32> %extracted_slice_1 = tensor.extract_slice %arg1[%12] [4] [1] : tensor<256xf32> to tensor<4xf32> // 内层reduction loop的vectorize %13 = scf.for %arg2 = %c0 to %c100 step %c4 iter_args(%arg3 = %extracted_slice_1) -> (tensor<4xf32>) { %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg2] [4, 4] [1, 1] : tensor<4x100xf32> to tensor<4x4xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[0, %arg2] [4, 4] [1, 1] : tensor<4x100xf32> to tensor<4x4xf32> %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice_2, %extracted_slice_3 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%arg3 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_4: f32, %out: f32): %16 = arith.addf %in, %in_4 : f32 %17 = arith.addf %16, %out : f32 linalg.yield %17 : f32 } -> tensor<4xf32> scf.yield %15 : tensor<4xf32> } %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0) scf.foreach_thread.perform_concurrently { tensor.parallel_insert_slice %13 into %arg1[%14] [4] [1] : tensor<4xf32> into tensor<256xf32> } } {mapping = [#gpu.thread<x>]} flow.dispatch.tensor.store %9, %2, offsets = [%3], sizes = [256], strides = [1] : tensor<256xf32> -> !flow.dispatch.tensor<readwrite:tensor<100000xf32>> } else { %6 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [%4, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %7 = flow.dispatch.tensor.load %1, offsets = [%3, 0], sizes = [%4, 100], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> -> tensor<?x100xf32> %8 = flow.dispatch.tensor.load %2, offsets = [%3], sizes = [%4], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> -> tensor<?xf32> %dim = tensor.dim %6, %c0 : tensor<?x100xf32> // 64个线程并发计算,每个线程计算[%11, 100]的分块 %9 = scf.foreach_thread (%arg0) in (%c64) shared_outs(%arg1 = %8) -> (tensor<?xf32>) { %10 = affine.min affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 64)) + s0, s0 ceildiv 64)>(%arg0)[%dim] %11 = affine.max affine_map<(d0) -> (0, d0)>(%10) %12 = affine.apply 
affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 64))>(%arg0)[%dim] %13 = affine.apply affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 64))>(%arg0)[%dim] %14 = affine.apply affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 64))>(%arg0)[%dim] %extracted_slice = tensor.extract_slice %6[%12, 0] [%11, 100] [1, 1] : tensor<?x100xf32> to tensor<?x100xf32> %extracted_slice_0 = tensor.extract_slice %7[%13, 0] [%11, 100] [1, 1] : tensor<?x100xf32> to tensor<?x100xf32> %extracted_slice_1 = tensor.extract_slice %arg1[%14] [%11] [1] : tensor<?xf32> to tensor<?xf32> // 内层reduction loop的vectorize %15 = scf.for %arg2 = %c0 to %c100 step %c4 iter_args(%arg3 = %extracted_slice_1) -> (tensor<?xf32>) { %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg2] [%11, 4] [1, 1] : tensor<?x100xf32> to tensor<?x4xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[0, %arg2] [%11, 4] [1, 1] : tensor<?x100xf32> to tensor<?x4xf32> %extracted_slice_4 = tensor.extract_slice %arg3[0] [%11] [1] : tensor<?xf32> to tensor<?xf32> %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice_2, %extracted_slice_3 : tensor<?x4xf32>, tensor<?x4xf32>) outs(%extracted_slice_4 : tensor<?xf32>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_5: f32, %out: f32): %18 = arith.addf %in, %in_5 : f32 %19 = arith.addf %18, %out : f32 linalg.yield %19 : f32 } -> tensor<?xf32> %inserted_slice = tensor.insert_slice %17 into %arg3[0] [%11] [1] : tensor<?xf32> into tensor<?xf32> scf.yield %inserted_slice : tensor<?xf32> } %16 = affine.apply affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 64))>(%arg0)[%dim] scf.foreach_thread.perform_concurrently { tensor.parallel_insert_slice %15 into %arg1[%16] [%11] [1] : tensor<?xf32> into tensor<?xf32> } } {mapping = [#gpu.thread<x>]} flow.dispatch.tensor.store %9, %2, offsets = [%3], sizes = [%4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<100000xf32>> } return} createRemoveSingleIterationLoopPass createGPUVectorizationPass 将内层可被向量化的linalg op转换成vector op。 1234567891011%11 = scf.for %arg2 = %c0 to %c100 step %c4 iter_args(%arg3 = %extracted_slice_1) -> (tensor<4xf32>) { %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg2] [4, 4] [1, 1] : tensor<4x100xf32> to tensor<4x4xf32> %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[0, %arg2] [4, 4] [1, 1] : tensor<4x100xf32> to tensor<4x4xf32> %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice_2, %extracted_slice_3 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%arg3 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_4: f32, %out: f32): %13 = arith.addf %in, %in_4 : f32 %14 = arith.addf %13, %out : f32 linalg.yield %14 : f32 } -> tensor<4xf32> scf.yield %12 : tensor<4xf32>} 转换成 1234567891011%11 = vector.transfer_read %extracted_slice_1[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>%12 = scf.for %arg2 = %c0 to %c100 step %c4 iter_args(%arg3 = %11) -> (vector<4xf32>) { %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg2] [4, 4] [1, 1] : tensor<4x100xf32> to tensor<4x4xf32> 
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[0, %arg2] [4, 4] [1, 1] : tensor<4x100xf32> to tensor<4x4xf32> %14 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32> %15 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32> %16 = arith.addf %14, %15 : vector<4x4xf32> %17 = vector.multi_reduction <add>, %16, %arg3 [1] : vector<4x4xf32> to vector<4xf32> scf.yield %17 : vector<4xf32>}%13 = vector.transfer_write %12, %extracted_slice_1[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> addBufferizePasses 将tensor语义转换成memref语义。上面完整的source func代码会转换成如下代码: 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162func.func @test_dispatch_0_generic_100000x100() { %cst = arith.constant 0.000000e+00 : f32 %c100 = arith.constant 100 : index %c4 = arith.constant 4 : index %c64 = arith.constant 64 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> memref.assume_alignment %0, 64 : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> memref.assume_alignment %2, 64 : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<100000x100xf32>> %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<100000xf32, #hal.descriptor_type<storage_buffer>> memref.assume_alignment %4, 64 : memref<100000xf32, #hal.descriptor_type<storage_buffer>> %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readwrite:tensor<100000xf32>> %workgroup_id_x = hal.interface.workgroup.id[0] : index %6 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %7 = affine.min affine_map<()[s0] -> (s0 * -256 + 100000, 256)>()[%workgroup_id_x] %8 = arith.cmpi eq, %7, %c256 : index scf.if %8 { %subview = memref.subview %0[%6, 0] [256, 100] [1, 1] : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> to memref<256x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_0 = memref.subview %2[%6, 0] [256, 100] [1, 1] : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> to memref<256x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_1 = memref.subview %4[%6] [256] [1] : memref<100000xf32, #hal.descriptor_type<storage_buffer>> to memref<256xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> scf.foreach_thread (%arg0) in (%c64) { %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0) %subview_2 = memref.subview %subview_1[%9] [4] [1] : memref<256xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> %10 = vector.transfer_read %subview_1[%9], %cst {in_bounds = [true]} : 
memref<256xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4xf32> %11 = scf.for %arg1 = %c0 to %c100 step %c4 iter_args(%arg2 = %10) -> (vector<4xf32>) { %12 = vector.transfer_read %subview[%9, %arg1], %cst {in_bounds = [true, true]} : memref<256x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> %13 = vector.transfer_read %subview_0[%9, %arg1], %cst {in_bounds = [true, true]} : memref<256x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> %14 = arith.addf %12, %13 : vector<4x4xf32> %15 = vector.multi_reduction <add>, %14, %arg2 [1] : vector<4x4xf32> to vector<4xf32> scf.yield %15 : vector<4xf32> } vector.transfer_write %11, %subview_2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> } {mapping = [#gpu.thread<x>]} } else { %subview = memref.subview %0[%6, 0] [%7, 100] [1, 1] : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> to memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_0 = memref.subview %2[%6, 0] [%7, 100] [1, 1] : memref<100000x100xf32, #hal.descriptor_type<storage_buffer>> to memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_1 = memref.subview %4[%6] [%7] [1] : memref<100000xf32, #hal.descriptor_type<storage_buffer>> to memref<?xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> scf.foreach_thread (%arg0) in (%c64) { %9 = affine.min affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 64)) + s0, s0 ceildiv 64)>(%arg0)[%7] %10 = affine.max affine_map<(d0) -> (0, d0)>(%9) %11 = affine.apply affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 64))>(%arg0)[%7] %subview_2 = memref.subview %subview[%11, 0] [%10, 100] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_3 = memref.subview %subview_0[%11, 0] [%10, 100] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_4 = memref.subview %subview_1[%11] [%10] [1] : memref<?xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>> scf.for %arg1 = %c0 to %c100 step %c4 { %subview_5 = memref.subview %subview_2[0, %arg1] [%10, 4] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> %subview_6 = memref.subview %subview_3[0, %arg1] [%10, 4] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<?x4xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<?x4xf32, strided<[100, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<?xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = 
#iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_7: f32, %out: f32): %12 = arith.addf %in, %in_7 : f32 %13 = arith.addf %12, %out : f32 linalg.yield %13 : f32 } } } {mapping = [#gpu.thread<x>]} } return} createLLVMGPUDistribute 将任务分配到每一个线程,source func从线程块的计算逻辑转换成每个线程的计算逻辑,即用gpu.thread_id(x, y, z)替换scf.foreach_thread。 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263func.func @test_dispatch_0_generic_100000x100() { %cst = arith.constant 0.000000e+00 : f32 %c100 = arith.constant 100 : index %c4 = arith.constant 4 : index %c64 = arith.constant 64 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %0, 64 : memref<100000x100xf32> %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %1, 64 : memref<100000x100xf32> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<100000xf32> memref.assume_alignment %2, 64 : memref<100000xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] %4 = affine.min affine_map<()[s0] -> (s0 * -256 + 100000, 256)>()[%workgroup_id_x] %5 = arith.cmpi eq, %4, %c256 : index scf.if %5 { %subview = memref.subview %0[%3, 0] [256, 100] [1, 1] : memref<100000x100xf32> to memref<256x100xf32, strided<[100, 1], offset: ?>> %subview_0 = memref.subview %1[%3, 0] [256, 100] [1, 1] : memref<100000x100xf32> to memref<256x100xf32, strided<[100, 1], offset: ?>> %subview_1 = memref.subview %2[%3] [256] [1] : memref<100000xf32> to memref<256xf32, strided<[1], offset: ?>> %c1 = arith.constant 1 : index %6 = gpu.thread_id x %7 = gpu.thread_id y %8 = gpu.thread_id z %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%6) %subview_2 = memref.subview %subview_1[%9] [4] [1] : memref<256xf32, strided<[1], offset: ?>> to memref<4xf32, strided<[1], offset: ?>> %10 = vector.transfer_read %subview_1[%9], %cst {in_bounds = [true]} : memref<256xf32, strided<[1], offset: ?>>, vector<4xf32> %11 = scf.for %arg0 = %c0 to %c100 step %c4 iter_args(%arg1 = %10) -> (vector<4xf32>) { %12 = vector.transfer_read %subview[%9, %arg0], %cst {in_bounds = [true, true]} : memref<256x100xf32, strided<[100, 1], offset: ?>>, vector<4x4xf32> %13 = vector.transfer_read %subview_0[%9, %arg0], %cst {in_bounds = [true, true]} : memref<256x100xf32, strided<[100, 1], offset: ?>>, vector<4x4xf32> %14 = arith.addf %12, %13 : vector<4x4xf32> %15 = vector.multi_reduction <add>, %14, %arg1 [1] : vector<4x4xf32> to vector<4xf32> scf.yield %15 : vector<4xf32> } vector.transfer_write %11, %subview_2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32, strided<[1], offset: ?>> } else { %subview = memref.subview %0[%3, 0] [%4, 100] [1, 1] : memref<100000x100xf32> to memref<?x100xf32, strided<[100, 1], offset: ?>> %subview_0 = memref.subview %1[%3, 0] [%4, 100] [1, 1] : memref<100000x100xf32> to memref<?x100xf32, strided<[100, 1], offset: ?>> %subview_1 = memref.subview %2[%3] [%4] [1] : memref<100000xf32> to memref<?xf32, strided<[1], offset: ?>> %c1 = arith.constant 1 : index %6 = gpu.thread_id x %7 = gpu.thread_id y %8 = gpu.thread_id z %9 = affine.min affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 64)) + s0, s0 ceildiv 64)>(%6)[%4] %10 = affine.max 
affine_map<(d0) -> (0, d0)>(%9) %11 = affine.apply affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 64))>(%6)[%4] %subview_2 = memref.subview %subview[%11, 0] [%10, 100] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>> to memref<?x100xf32, strided<[100, 1], offset: ?>> %subview_3 = memref.subview %subview_0[%11, 0] [%10, 100] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>> to memref<?x100xf32, strided<[100, 1], offset: ?>> %subview_4 = memref.subview %subview_1[%11] [%10] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>> scf.for %arg0 = %c0 to %c100 step %c4 { %subview_5 = memref.subview %subview_2[0, %arg0] [%10, 4] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>> to memref<?x4xf32, strided<[100, 1], offset: ?>> %subview_6 = memref.subview %subview_3[0, %arg0] [%10, 4] [1, 1] : memref<?x100xf32, strided<[100, 1], offset: ?>> to memref<?x4xf32, strided<[100, 1], offset: ?>> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<?x4xf32, strided<[100, 1], offset: ?>>, memref<?x4xf32, strided<[100, 1], offset: ?>>) outs(%subview_4 : memref<?xf32, strided<[1], offset: ?>>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[256, 4]]>} { ^bb0(%in: f32, %in_7: f32, %out: f32): %12 = arith.addf %in, %in_7 : f32 %13 = arith.addf %12, %out : f32 linalg.yield %13 : f32 } } } return} createLoopInvariantCodeMotionPass memref::createFoldMemRefAliasOpsPass createOptimizeVectorTransferPass GPUMatmulSimtPassPipeline GPUMatmulTensorCorePassPipeline GPUTransposePassPipeline GPUWarpReductionPassPipeline GPUTransformDialectPasses addLowerToLLVMGPUPasses 继续将device代码递降到affine和gpu dialect,最终转换到NVVM IR或ROCDL IR。 IREE::LinalgExt::createLinalgExtToLoopsPass 将LinalgExt op转换成loops。 createMemrefCopyToLinalgPass 将memref.copy转换成linalg generic op。 createConvertLinalgToLoopsPass 将linalg generic op转换成loops。 createPadDynamicAlloc 以pad的方式申请动态大小的内存。比如需要申请的内存大小和dim相关,%dim = affine_max(0, %src),那么这里就会以%dim = %src的最大size来申请内存。 createLowerAffinePass 将affine op(比如affine.for, affine.if and affine.apply等) 递降成更低层的arith、memref和scf op。上面完整的source func代码会转换成如下代码, 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374func.func @test_dispatch_0_generic_100000x100() { %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %c100000 = arith.constant 100000 : index %c-256 = arith.constant -256 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f32 %c100 = arith.constant 100 : index %c4 = arith.constant 4 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %0, 64 : memref<100000x100xf32> %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %1, 64 : memref<100000x100xf32> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<100000xf32> memref.assume_alignment %2, 64 : memref<100000xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = arith.muli %workgroup_id_x, %c-256 : index %4 = arith.addi %3, %c100000 : index %5 = 
arith.cmpi slt, %4, %c256 : index %6 = arith.select %5, %4, %c256 : index %7 = arith.cmpi eq, %6, %c256 : index scf.if %7 { %8 = gpu.thread_id x %9 = arith.muli %8, %c4 : index %10 = arith.muli %workgroup_id_x, %c256 : index %11 = arith.addi %9, %10 : index %12 = vector.transfer_read %2[%11], %cst {in_bounds = [true]} : memref<100000xf32>, vector<4xf32> %13 = scf.for %arg0 = %c0 to %c100 step %c4 iter_args(%arg1 = %12) -> (vector<4xf32>) { %14 = vector.transfer_read %0[%11, %arg0], %cst {in_bounds = [true, true]} : memref<100000x100xf32>, vector<4x4xf32> %15 = vector.transfer_read %1[%11, %arg0], %cst {in_bounds = [true, true]} : memref<100000x100xf32>, vector<4x4xf32> %16 = arith.addf %14, %15 : vector<4x4xf32> %17 = vector.multi_reduction <add>, %16, %arg1 [1] : vector<4x4xf32> to vector<4xf32> scf.yield %17 : vector<4xf32> } vector.transfer_write %13, %2[%11] {in_bounds = [true]} : vector<4xf32>, memref<100000xf32> } else { %8 = gpu.thread_id x %9 = arith.cmpi sle, %6, %c0 : index %10 = arith.subi %c0, %6 : index %11 = arith.subi %6, %c1 : index %12 = arith.select %9, %10, %11 : index %13 = arith.divsi %12, %c64 : index %14 = arith.subi %c0, %13 : index %15 = arith.addi %13, %c1 : index %16 = arith.select %9, %14, %15 : index %17 = arith.muli %8, %16 : index %18 = arith.muli %17, %c-1 : index %19 = arith.addi %18, %6 : index %20 = arith.cmpi slt, %19, %16 : index %21 = arith.select %20, %19, %16 : index %22 = arith.cmpi slt, %21, %c0 : index %23 = arith.select %22, %c0, %21 : index %24 = arith.muli %workgroup_id_x, %c256 : index %25 = arith.addi %17, %24 : index %subview = memref.subview %2[%25] [%23] [1] : memref<100000xf32> to memref<?xf32, strided<[1], offset: ?>> scf.for %arg0 = %c0 to %c100 step %c4 { %subview_0 = memref.subview %0[%25, %arg0] [%23, 4] [1, 1] : memref<100000x100xf32> to memref<?x4xf32, strided<[100, 1], offset: ?>> %subview_1 = memref.subview %1[%25, %arg0] [%23, 4] [1, 1] : memref<100000x100xf32> to memref<?x4xf32, strided<[100, 1], offset: ?>> scf.for %arg1 = %c0 to %23 step %c1 { scf.for %arg2 = %c0 to %c4 step %c1 { %26 = memref.load %subview_0[%arg1, %arg2] : memref<?x4xf32, strided<[100, 1], offset: ?>> %27 = memref.load %subview_1[%arg1, %arg2] : memref<?x4xf32, strided<[100, 1], offset: ?>> %28 = memref.load %subview[%arg1] : memref<?xf32, strided<[1], offset: ?>> %29 = arith.addf %26, %27 : f32 %30 = arith.addf %29, %28 : f32 memref.store %30, %subview[%arg1] : memref<?xf32, strided<[1], offset: ?>> } } } } return} arith::createConstantBufferizePass createFoldTensorExtractOpPass createLLVMGPUVectorLoweringPass 将多维vector op展开成一维的vector op。上面完整的source func代码会转换成如下代码, 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105func.func @test_dispatch_0_generic_100000x100() { %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %c100000 = arith.constant 100000 : index %c-256 = arith.constant -256 : index %c1 = arith.constant 1 : index %c100 = arith.constant 100 : index %c4 = arith.constant 4 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %0, 64 : memref<100000x100xf32> %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : 
memref<100000x100xf32> memref.assume_alignment %1, 64 : memref<100000x100xf32> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<100000xf32> memref.assume_alignment %2, 64 : memref<100000xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = arith.muli %workgroup_id_x, %c-256 : index %4 = arith.addi %3, %c100000 : index %5 = arith.cmpi slt, %4, %c256 : index %6 = arith.select %5, %4, %c256 : index %7 = arith.cmpi eq, %6, %c256 : index scf.if %7 { %8 = gpu.thread_id x %9 = arith.muli %8, %c4 : index %10 = arith.muli %workgroup_id_x, %c256 : index %11 = arith.addi %9, %10 : index %12 = vector.load %2[%11] : memref<100000xf32>, vector<4xf32> %13 = scf.for %arg0 = %c0 to %c100 step %c4 iter_args(%arg1 = %12) -> (vector<4xf32>) { %14 = vector.load %0[%11, %arg0] : memref<100000x100xf32>, vector<4xf32> %15 = vector.insert %14, %cst [0] : vector<4xf32> into vector<4x4xf32> %16 = affine.apply affine_map<(d0) -> (d0 + 1)>(%11) %17 = vector.load %0[%16, %arg0] : memref<100000x100xf32>, vector<4xf32> %18 = vector.insert %17, %15 [1] : vector<4xf32> into vector<4x4xf32> %19 = affine.apply affine_map<(d0) -> (d0 + 2)>(%11) %20 = vector.load %0[%19, %arg0] : memref<100000x100xf32>, vector<4xf32> %21 = vector.insert %20, %18 [2] : vector<4xf32> into vector<4x4xf32> %22 = affine.apply affine_map<(d0) -> (d0 + 3)>(%11) %23 = vector.load %0[%22, %arg0] : memref<100000x100xf32>, vector<4xf32> %24 = vector.insert %23, %21 [3] : vector<4xf32> into vector<4x4xf32> %25 = vector.load %1[%11, %arg0] : memref<100000x100xf32>, vector<4xf32> %26 = vector.insert %25, %cst [0] : vector<4xf32> into vector<4x4xf32> %27 = affine.apply affine_map<(d0) -> (d0 + 1)>(%11) %28 = vector.load %1[%27, %arg0] : memref<100000x100xf32>, vector<4xf32> %29 = vector.insert %28, %26 [1] : vector<4xf32> into vector<4x4xf32> %30 = affine.apply affine_map<(d0) -> (d0 + 2)>(%11) %31 = vector.load %1[%30, %arg0] : memref<100000x100xf32>, vector<4xf32> %32 = vector.insert %31, %29 [2] : vector<4xf32> into vector<4x4xf32> %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%11) %34 = vector.load %1[%33, %arg0] : memref<100000x100xf32>, vector<4xf32> %35 = vector.insert %34, %32 [3] : vector<4xf32> into vector<4x4xf32> %36 = arith.addf %24, %35 : vector<4x4xf32> %37 = vector.transpose %36, [1, 0] : vector<4x4xf32> to vector<4x4xf32> %38 = vector.extract %37[0] : vector<4x4xf32> %39 = arith.addf %38, %arg1 : vector<4xf32> %40 = vector.extract %37[1] : vector<4x4xf32> %41 = arith.addf %40, %39 : vector<4xf32> %42 = vector.extract %37[2] : vector<4x4xf32> %43 = arith.addf %42, %41 : vector<4xf32> %44 = vector.extract %37[3] : vector<4x4xf32> %45 = arith.addf %44, %43 : vector<4xf32> scf.yield %45 : vector<4xf32> } vector.store %13, %2[%11] : memref<100000xf32>, vector<4xf32> } else { %8 = gpu.thread_id x %9 = arith.cmpi sle, %6, %c0 : index %10 = arith.subi %c0, %6 : index %11 = arith.subi %6, %c1 : index %12 = arith.select %9, %10, %11 : index %13 = arith.divsi %12, %c64 : index %14 = arith.subi %c0, %13 : index %15 = arith.addi %13, %c1 : index %16 = arith.select %9, %14, %15 : index %17 = arith.muli %8, %16 : index %18 = arith.muli %17, %c-1 : index %19 = arith.addi %18, %6 : index %20 = arith.cmpi slt, %19, %16 : index %21 = arith.select %20, %19, %16 : index %22 = arith.cmpi slt, %21, %c0 : index %23 = arith.select %22, %c0, %21 : index %24 = arith.muli %workgroup_id_x, %c256 : index %25 = arith.addi %17, %24 : index scf.for %arg0 = %c0 to %c100 step %c4 { scf.for %arg1 = %c0 
to %23 step %c1 { scf.for %arg2 = %c0 to %c4 step %c1 { %26 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg1)[%25] %27 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%arg0] %28 = memref.load %0[%26, %27] : memref<100000x100xf32> %29 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg1)[%25] %30 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%arg0] %31 = memref.load %1[%29, %30] : memref<100000x100xf32> %32 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg1)[%25] %33 = memref.load %2[%32] : memref<100000xf32> %34 = arith.addf %28, %31 : f32 %35 = arith.addf %34, %33 : f32 %36 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg1)[%25] memref.store %35, %2[%36] : memref<100000xf32> } } } } return} createConvertSCFToCFPass 将structure的control flow转换成CFG的控制流。上面完整的source func代码会转换成如下代码, 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121func.func @test_dispatch_0_generic_100000x100() { %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %c100000 = arith.constant 100000 : index %c-256 = arith.constant -256 : index %c1 = arith.constant 1 : index %c100 = arith.constant 100 : index %c4 = arith.constant 4 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %0, 64 : memref<100000x100xf32> %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<100000x100xf32> memref.assume_alignment %1, 64 : memref<100000x100xf32> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<100000xf32> memref.assume_alignment %2, 64 : memref<100000xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = arith.muli %workgroup_id_x, %c-256 : index %4 = arith.addi %3, %c100000 : index %5 = arith.cmpi slt, %4, %c256 : index %6 = arith.select %5, %4, %c256 : index %7 = arith.cmpi eq, %6, %c256 : index cf.cond_br %7, ^bb1, ^bb5 ^bb1: // pred: ^bb0 %8 = gpu.thread_id x %9 = arith.muli %8, %c4 : index %10 = arith.muli %workgroup_id_x, %c256 : index %11 = arith.addi %9, %10 : index %12 = vector.load %2[%11] : memref<100000xf32>, vector<4xf32> cf.br ^bb2(%c0, %12 : index, vector<4xf32>) ^bb2(%13: index, %14: vector<4xf32>): // 2 preds: ^bb1, ^bb3 %15 = arith.cmpi slt, %13, %c100 : index cf.cond_br %15, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %16 = vector.load %0[%11, %13] : memref<100000x100xf32>, vector<4xf32> %17 = vector.insert %16, %cst [0] : vector<4xf32> into vector<4x4xf32> %c1_0 = arith.constant 1 : index %18 = arith.addi %11, %c1_0 : index %19 = vector.load %0[%18, %13] : memref<100000x100xf32>, vector<4xf32> %20 = vector.insert %19, %17 [1] : vector<4xf32> into vector<4x4xf32> %c2 = arith.constant 2 : index %21 = arith.addi %11, %c2 : index %22 = vector.load %0[%21, %13] : memref<100000x100xf32>, vector<4xf32> %23 = vector.insert %22, %20 [2] : vector<4xf32> into vector<4x4xf32> %c3 = arith.constant 3 : index %24 = arith.addi %11, %c3 : index %25 = vector.load %0[%24, %13] : memref<100000x100xf32>, vector<4xf32> %26 = vector.insert %25, %23 [3] : vector<4xf32> into vector<4x4xf32> %27 = vector.load %1[%11, %13] : memref<100000x100xf32>, 
vector<4xf32> %28 = vector.insert %27, %cst [0] : vector<4xf32> into vector<4x4xf32> %29 = vector.load %1[%18, %13] : memref<100000x100xf32>, vector<4xf32> %30 = vector.insert %29, %28 [1] : vector<4xf32> into vector<4x4xf32> %31 = vector.load %1[%21, %13] : memref<100000x100xf32>, vector<4xf32> %32 = vector.insert %31, %30 [2] : vector<4xf32> into vector<4x4xf32> %33 = vector.load %1[%24, %13] : memref<100000x100xf32>, vector<4xf32> %34 = vector.insert %33, %32 [3] : vector<4xf32> into vector<4x4xf32> %35 = arith.addf %26, %34 : vector<4x4xf32> %36 = vector.transpose %35, [1, 0] : vector<4x4xf32> to vector<4x4xf32> %37 = vector.extract %36[0] : vector<4x4xf32> %38 = arith.addf %37, %14 : vector<4xf32> %39 = vector.extract %36[1] : vector<4x4xf32> %40 = arith.addf %39, %38 : vector<4xf32> %41 = vector.extract %36[2] : vector<4x4xf32> %42 = arith.addf %41, %40 : vector<4xf32> %43 = vector.extract %36[3] : vector<4x4xf32> %44 = arith.addf %43, %42 : vector<4xf32> %45 = arith.addi %13, %c4 : index cf.br ^bb2(%45, %44 : index, vector<4xf32>) ^bb4: // pred: ^bb2 vector.store %14, %2[%11] : memref<100000xf32>, vector<4xf32> cf.br ^bb12 ^bb5: // pred: ^bb0 %46 = gpu.thread_id x %47 = arith.cmpi sle, %6, %c0 : index %48 = arith.subi %c0, %6 : index %49 = arith.subi %6, %c1 : index %50 = arith.select %47, %48, %49 : index %51 = arith.divsi %50, %c64 : index %52 = arith.subi %c0, %51 : index %53 = arith.addi %51, %c1 : index %54 = arith.select %47, %52, %53 : index %55 = arith.muli %46, %54 : index %56 = arith.muli %55, %c-1 : index %57 = arith.addi %56, %6 : index %58 = arith.cmpi slt, %57, %54 : index %59 = arith.select %58, %57, %54 : index %60 = arith.cmpi slt, %59, %c0 : index %61 = arith.select %60, %c0, %59 : index %62 = arith.muli %workgroup_id_x, %c256 : index %63 = arith.addi %55, %62 : index cf.br ^bb6(%c0 : index) ^bb6(%64: index): // 2 preds: ^bb5, ^bb11 %65 = arith.cmpi slt, %64, %c100 : index cf.cond_br %65, ^bb7(%c0 : index), ^bb12 ^bb7(%66: index): // 2 preds: ^bb6, ^bb10 %67 = arith.cmpi slt, %66, %61 : index cf.cond_br %67, ^bb8(%c0 : index), ^bb11 ^bb8(%68: index): // 2 preds: ^bb7, ^bb9 %69 = arith.cmpi slt, %68, %c4 : index cf.cond_br %69, ^bb9, ^bb10 ^bb9: // pred: ^bb8 %70 = arith.addi %63, %66 : index %71 = arith.addi %64, %68 : index %72 = memref.load %0[%70, %71] : memref<100000x100xf32> %73 = memref.load %1[%70, %71] : memref<100000x100xf32> %74 = memref.load %2[%70] : memref<100000xf32> %75 = arith.addf %72, %73 : f32 %76 = arith.addf %75, %74 : f32 memref.store %76, %2[%70] : memref<100000xf32> %77 = arith.addi %68, %c1 : index cf.br ^bb8(%77 : index) ^bb10: // pred: ^bb8 %78 = arith.addi %66, %c1 : index cf.br ^bb7(%78 : index) ^bb11: // pred: ^bb7 %79 = arith.addi %64, %c4 : index cf.br ^bb6(%79 : index) ^bb12: // 2 preds: ^bb4, ^bb6 return} createPolynomialApproximationPass arith::createArithExpandOpsPass memref::createExpandOpsPass memref::createExpandStridedMetadataPass createLowerAffinePass createStripDebugInfoPass createConvertToROCDLPass或createConvertToNVVMPass 转换到ROCDL IR或NVVM IR。上面完整的source func代码会转换成如下代码, 
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377llvm.func @test_dispatch_0_generic_100000x100(%arg0: !llvm.ptr<f32> {llvm.align = 16 : i32}, %arg1: !llvm.ptr<f32> {llvm.align = 16 : i32}, %arg2: !llvm.ptr<f32> {llvm.align = 16 : i32}) { %0 = llvm.mlir.constant(3 : index) : i64 %1 = llvm.mlir.constant(2 : index) : i64 %2 = llvm.mlir.constant(dense<0.000000e+00> : vector<4x4xf32>) : !llvm.array<4 x vector<4xf32>> %3 = llvm.mlir.constant(-1 : index) : i64 %4 = llvm.mlir.constant(64 : index) : i64 %5 = llvm.mlir.constant(100000 : index) : i64 %6 = llvm.mlir.constant(-256 : index) : i64 %7 = llvm.mlir.constant(1 : index) : i64 %8 = llvm.mlir.constant(100 : index) : i64 %9 = llvm.mlir.constant(4 : index) : i64 %10 = llvm.mlir.constant(256 : index) : i64 %11 = llvm.mlir.constant(0 : index) : i64 %12 = llvm.bitcast %arg0 : !llvm.ptr<f32> to !llvm.ptr<i8> %13 = llvm.getelementptr %12[%11] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> %14 = llvm.bitcast %13 : !llvm.ptr<i8> to !llvm.ptr<f32> %15 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %16 = llvm.insertvalue %14, %15[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %17 = llvm.insertvalue %14, %16[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %18 = llvm.mlir.constant(0 : index) : i64 %19 = llvm.insertvalue %18, %17[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %20 = llvm.mlir.constant(100000 : index) : i64 %21 = llvm.insertvalue %20, %19[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %22 = llvm.mlir.constant(100 : index) : i64 %23 = llvm.insertvalue %22, %21[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %24 = llvm.mlir.constant(100 : index) : i64 %25 = llvm.insertvalue %24, %23[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %26 = llvm.mlir.constant(1 : index) : i64 %27 = llvm.insertvalue %26, %25[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %28 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %29 = llvm.mlir.constant(0 : index) : i64 %30 = llvm.mlir.constant(63 : index) : i64 %31 = llvm.ptrtoint %28 : !llvm.ptr<f32> to i64 %32 = llvm.and %31, %30 : i64 %33 = llvm.icmp "eq" %32, %29 : i64 "llvm.intr.assume"(%33) : (i1) -> () %34 = llvm.bitcast %arg1 : !llvm.ptr<f32> to !llvm.ptr<i8> %35 = llvm.getelementptr %34[%11] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> %36 = llvm.bitcast %35 : 
!llvm.ptr<i8> to !llvm.ptr<f32> %37 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %38 = llvm.insertvalue %36, %37[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %39 = llvm.insertvalue %36, %38[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %40 = llvm.mlir.constant(0 : index) : i64 %41 = llvm.insertvalue %40, %39[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %42 = llvm.mlir.constant(100000 : index) : i64 %43 = llvm.insertvalue %42, %41[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %44 = llvm.mlir.constant(100 : index) : i64 %45 = llvm.insertvalue %44, %43[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %46 = llvm.mlir.constant(100 : index) : i64 %47 = llvm.insertvalue %46, %45[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %48 = llvm.mlir.constant(1 : index) : i64 %49 = llvm.insertvalue %48, %47[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %50 = llvm.extractvalue %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %51 = llvm.mlir.constant(0 : index) : i64 %52 = llvm.mlir.constant(63 : index) : i64 %53 = llvm.ptrtoint %50 : !llvm.ptr<f32> to i64 %54 = llvm.and %53, %52 : i64 %55 = llvm.icmp "eq" %54, %51 : i64 "llvm.intr.assume"(%55) : (i1) -> () %56 = llvm.bitcast %arg2 : !llvm.ptr<f32> to !llvm.ptr<i8> %57 = llvm.getelementptr %56[%11] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> %58 = llvm.bitcast %57 : !llvm.ptr<i8> to !llvm.ptr<f32> %59 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %60 = llvm.insertvalue %58, %59[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %61 = llvm.insertvalue %58, %60[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %62 = llvm.mlir.constant(0 : index) : i64 %63 = llvm.insertvalue %62, %61[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %64 = llvm.mlir.constant(100000 : index) : i64 %65 = llvm.insertvalue %64, %63[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %66 = llvm.mlir.constant(1 : index) : i64 %67 = llvm.insertvalue %66, %65[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %68 = llvm.extractvalue %67[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %69 = llvm.mlir.constant(0 : index) : i64 %70 = llvm.mlir.constant(63 : index) : i64 %71 = llvm.ptrtoint %68 : !llvm.ptr<f32> to i64 %72 = llvm.and %71, %70 : i64 %73 = llvm.icmp "eq" %72, %69 : i64 "llvm.intr.assume"(%73) : (i1) -> () %74 = nvvm.read.ptx.sreg.ctaid.x : i32 %75 = llvm.sext %74 : i32 to i64 %76 = llvm.mul %75, %6 : i64 %77 = llvm.add %76, %5 : i64 %78 = llvm.icmp "slt" %77, %10 : i64 %79 = llvm.select %78, %77, %10 : i1, i64 %80 = llvm.icmp "eq" %79, %10 : i64 llvm.cond_br %80, ^bb1, ^bb5 ^bb1: // pred: ^bb0 %81 = nvvm.read.ptx.sreg.tid.x : i32 %82 = llvm.sext %81 : i32 to i64 %83 = llvm.mul %82, %9 : i64 %84 = llvm.mul %75, %10 : i64 %85 = llvm.add %83, %84 : i64 %86 = llvm.extractvalue %67[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %87 = llvm.getelementptr %86[%85] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %88 = llvm.bitcast %87 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %89 = llvm.load %88 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> llvm.br 
^bb2(%11, %89 : i64, vector<4xf32>) ^bb2(%90: i64, %91: vector<4xf32>): // 2 preds: ^bb1, ^bb3 %92 = llvm.icmp "slt" %90, %8 : i64 llvm.cond_br %92, ^bb3, ^bb4 ^bb3: // pred: ^bb2 %93 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %94 = llvm.mlir.constant(100 : index) : i64 %95 = llvm.mul %85, %94 : i64 %96 = llvm.add %95, %90 : i64 %97 = llvm.getelementptr %93[%96] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %98 = llvm.bitcast %97 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %99 = llvm.load %98 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %100 = llvm.insertvalue %99, %2[0] : !llvm.array<4 x vector<4xf32>> %101 = llvm.add %85, %7 : i64 %102 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %103 = llvm.mlir.constant(100 : index) : i64 %104 = llvm.mul %101, %103 : i64 %105 = llvm.add %104, %90 : i64 %106 = llvm.getelementptr %102[%105] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %107 = llvm.bitcast %106 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %108 = llvm.load %107 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %109 = llvm.insertvalue %108, %100[1] : !llvm.array<4 x vector<4xf32>> %110 = llvm.add %85, %1 : i64 %111 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %112 = llvm.mlir.constant(100 : index) : i64 %113 = llvm.mul %110, %112 : i64 %114 = llvm.add %113, %90 : i64 %115 = llvm.getelementptr %111[%114] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %116 = llvm.bitcast %115 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %117 = llvm.load %116 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %118 = llvm.insertvalue %117, %109[2] : !llvm.array<4 x vector<4xf32>> %119 = llvm.add %85, %0 : i64 %120 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %121 = llvm.mlir.constant(100 : index) : i64 %122 = llvm.mul %119, %121 : i64 %123 = llvm.add %122, %90 : i64 %124 = llvm.getelementptr %120[%123] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %125 = llvm.bitcast %124 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %126 = llvm.load %125 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %127 = llvm.insertvalue %126, %118[3] : !llvm.array<4 x vector<4xf32>> %128 = llvm.extractvalue %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %129 = llvm.mlir.constant(100 : index) : i64 %130 = llvm.mul %85, %129 : i64 %131 = llvm.add %130, %90 : i64 %132 = llvm.getelementptr %128[%131] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %133 = llvm.bitcast %132 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %134 = llvm.load %133 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %135 = llvm.insertvalue %134, %2[0] : !llvm.array<4 x vector<4xf32>> %136 = llvm.extractvalue %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %137 = llvm.mlir.constant(100 : index) : i64 %138 = llvm.mul %101, %137 : i64 %139 = llvm.add %138, %90 : i64 %140 = llvm.getelementptr %136[%139] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %141 = llvm.bitcast %140 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %142 = llvm.load %141 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %143 = llvm.insertvalue %142, %135[1] : !llvm.array<4 x vector<4xf32>> %144 = llvm.extractvalue %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %145 = llvm.mlir.constant(100 : index) : i64 %146 = llvm.mul %110, %145 : i64 %147 = llvm.add %146, %90 : i64 %148 = llvm.getelementptr %144[%147] : 
(!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %149 = llvm.bitcast %148 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %150 = llvm.load %149 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %151 = llvm.insertvalue %150, %143[2] : !llvm.array<4 x vector<4xf32>> %152 = llvm.extractvalue %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %153 = llvm.mlir.constant(100 : index) : i64 %154 = llvm.mul %119, %153 : i64 %155 = llvm.add %154, %90 : i64 %156 = llvm.getelementptr %152[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %157 = llvm.bitcast %156 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> %158 = llvm.load %157 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> %159 = llvm.insertvalue %158, %151[3] : !llvm.array<4 x vector<4xf32>> %160 = llvm.mlir.undef : !llvm.array<4 x vector<4xf32>> %161 = llvm.extractvalue %127[0] : !llvm.array<4 x vector<4xf32>> %162 = llvm.extractvalue %159[0] : !llvm.array<4 x vector<4xf32>> %163 = llvm.fadd %161, %162 : vector<4xf32> %164 = llvm.insertvalue %163, %160[0] : !llvm.array<4 x vector<4xf32>> %165 = llvm.extractvalue %127[1] : !llvm.array<4 x vector<4xf32>> %166 = llvm.extractvalue %159[1] : !llvm.array<4 x vector<4xf32>> %167 = llvm.fadd %165, %166 : vector<4xf32> %168 = llvm.insertvalue %167, %164[1] : !llvm.array<4 x vector<4xf32>> %169 = llvm.extractvalue %127[2] : !llvm.array<4 x vector<4xf32>> %170 = llvm.extractvalue %159[2] : !llvm.array<4 x vector<4xf32>> %171 = llvm.fadd %169, %170 : vector<4xf32> %172 = llvm.insertvalue %171, %168[2] : !llvm.array<4 x vector<4xf32>> %173 = llvm.extractvalue %127[3] : !llvm.array<4 x vector<4xf32>> %174 = llvm.extractvalue %159[3] : !llvm.array<4 x vector<4xf32>> %175 = llvm.fadd %173, %174 : vector<4xf32> %176 = llvm.insertvalue %175, %172[3] : !llvm.array<4 x vector<4xf32>> %177 = llvm.extractvalue %176[0] : !llvm.array<4 x vector<4xf32>> %178 = llvm.mlir.constant(0 : i64) : i64 %179 = llvm.extractelement %177[%178 : i64] : vector<4xf32> %180 = llvm.extractvalue %2[0] : !llvm.array<4 x vector<4xf32>> %181 = llvm.mlir.constant(0 : i64) : i64 %182 = llvm.insertelement %179, %180[%181 : i64] : vector<4xf32> %183 = llvm.insertvalue %182, %2[0] : !llvm.array<4 x vector<4xf32>> %184 = llvm.extractvalue %176[0] : !llvm.array<4 x vector<4xf32>> %185 = llvm.mlir.constant(1 : i64) : i64 %186 = llvm.extractelement %184[%185 : i64] : vector<4xf32> %187 = llvm.extractvalue %183[1] : !llvm.array<4 x vector<4xf32>> %188 = llvm.mlir.constant(0 : i64) : i64 %189 = llvm.insertelement %186, %187[%188 : i64] : vector<4xf32> %190 = llvm.insertvalue %189, %183[1] : !llvm.array<4 x vector<4xf32>> %191 = llvm.extractvalue %176[0] : !llvm.array<4 x vector<4xf32>> %192 = llvm.mlir.constant(2 : i64) : i64 %193 = llvm.extractelement %191[%192 : i64] : vector<4xf32> %194 = llvm.extractvalue %190[2] : !llvm.array<4 x vector<4xf32>> %195 = llvm.mlir.constant(0 : i64) : i64 %196 = llvm.insertelement %193, %194[%195 : i64] : vector<4xf32> %197 = llvm.insertvalue %196, %190[2] : !llvm.array<4 x vector<4xf32>> %198 = llvm.extractvalue %176[0] : !llvm.array<4 x vector<4xf32>> %199 = llvm.mlir.constant(3 : i64) : i64 %200 = llvm.extractelement %198[%199 : i64] : vector<4xf32> %201 = llvm.extractvalue %197[3] : !llvm.array<4 x vector<4xf32>> %202 = llvm.mlir.constant(0 : i64) : i64 %203 = llvm.insertelement %200, %201[%202 : i64] : vector<4xf32> %204 = llvm.insertvalue %203, %197[3] : !llvm.array<4 x vector<4xf32>> %205 = llvm.extractvalue %176[1] : !llvm.array<4 x vector<4xf32>> %206 = llvm.mlir.constant(0 : i64) : i64 %207 = 
llvm.extractelement %205[%206 : i64] : vector<4xf32> %208 = llvm.extractvalue %204[0] : !llvm.array<4 x vector<4xf32>> %209 = llvm.mlir.constant(1 : i64) : i64 %210 = llvm.insertelement %207, %208[%209 : i64] : vector<4xf32> %211 = llvm.insertvalue %210, %204[0] : !llvm.array<4 x vector<4xf32>> %212 = llvm.extractvalue %176[1] : !llvm.array<4 x vector<4xf32>> %213 = llvm.mlir.constant(1 : i64) : i64 %214 = llvm.extractelement %212[%213 : i64] : vector<4xf32> %215 = llvm.extractvalue %211[1] : !llvm.array<4 x vector<4xf32>> %216 = llvm.mlir.constant(1 : i64) : i64 %217 = llvm.insertelement %214, %215[%216 : i64] : vector<4xf32> %218 = llvm.insertvalue %217, %211[1] : !llvm.array<4 x vector<4xf32>> %219 = llvm.extractvalue %176[1] : !llvm.array<4 x vector<4xf32>> %220 = llvm.mlir.constant(2 : i64) : i64 %221 = llvm.extractelement %219[%220 : i64] : vector<4xf32> %222 = llvm.extractvalue %218[2] : !llvm.array<4 x vector<4xf32>> %223 = llvm.mlir.constant(1 : i64) : i64 %224 = llvm.insertelement %221, %222[%223 : i64] : vector<4xf32> %225 = llvm.insertvalue %224, %218[2] : !llvm.array<4 x vector<4xf32>> %226 = llvm.extractvalue %176[1] : !llvm.array<4 x vector<4xf32>> %227 = llvm.mlir.constant(3 : i64) : i64 %228 = llvm.extractelement %226[%227 : i64] : vector<4xf32> %229 = llvm.extractvalue %225[3] : !llvm.array<4 x vector<4xf32>> %230 = llvm.mlir.constant(1 : i64) : i64 %231 = llvm.insertelement %228, %229[%230 : i64] : vector<4xf32> %232 = llvm.insertvalue %231, %225[3] : !llvm.array<4 x vector<4xf32>> %233 = llvm.extractvalue %176[2] : !llvm.array<4 x vector<4xf32>> %234 = llvm.mlir.constant(0 : i64) : i64 %235 = llvm.extractelement %233[%234 : i64] : vector<4xf32> %236 = llvm.extractvalue %232[0] : !llvm.array<4 x vector<4xf32>> %237 = llvm.mlir.constant(2 : i64) : i64 %238 = llvm.insertelement %235, %236[%237 : i64] : vector<4xf32> %239 = llvm.insertvalue %238, %232[0] : !llvm.array<4 x vector<4xf32>> %240 = llvm.extractvalue %176[2] : !llvm.array<4 x vector<4xf32>> %241 = llvm.mlir.constant(1 : i64) : i64 %242 = llvm.extractelement %240[%241 : i64] : vector<4xf32> %243 = llvm.extractvalue %239[1] : !llvm.array<4 x vector<4xf32>> %244 = llvm.mlir.constant(2 : i64) : i64 %245 = llvm.insertelement %242, %243[%244 : i64] : vector<4xf32> %246 = llvm.insertvalue %245, %239[1] : !llvm.array<4 x vector<4xf32>> %247 = llvm.extractvalue %176[2] : !llvm.array<4 x vector<4xf32>> %248 = llvm.mlir.constant(2 : i64) : i64 %249 = llvm.extractelement %247[%248 : i64] : vector<4xf32> %250 = llvm.extractvalue %246[2] : !llvm.array<4 x vector<4xf32>> %251 = llvm.mlir.constant(2 : i64) : i64 %252 = llvm.insertelement %249, %250[%251 : i64] : vector<4xf32> %253 = llvm.insertvalue %252, %246[2] : !llvm.array<4 x vector<4xf32>> %254 = llvm.extractvalue %176[2] : !llvm.array<4 x vector<4xf32>> %255 = llvm.mlir.constant(3 : i64) : i64 %256 = llvm.extractelement %254[%255 : i64] : vector<4xf32> %257 = llvm.extractvalue %253[3] : !llvm.array<4 x vector<4xf32>> %258 = llvm.mlir.constant(2 : i64) : i64 %259 = llvm.insertelement %256, %257[%258 : i64] : vector<4xf32> %260 = llvm.insertvalue %259, %253[3] : !llvm.array<4 x vector<4xf32>> %261 = llvm.extractvalue %176[3] : !llvm.array<4 x vector<4xf32>> %262 = llvm.mlir.constant(0 : i64) : i64 %263 = llvm.extractelement %261[%262 : i64] : vector<4xf32> %264 = llvm.extractvalue %260[0] : !llvm.array<4 x vector<4xf32>> %265 = llvm.mlir.constant(3 : i64) : i64 %266 = llvm.insertelement %263, %264[%265 : i64] : vector<4xf32> %267 = llvm.insertvalue %266, %260[0] : 
!llvm.array<4 x vector<4xf32>> %268 = llvm.extractvalue %176[3] : !llvm.array<4 x vector<4xf32>> %269 = llvm.mlir.constant(1 : i64) : i64 %270 = llvm.extractelement %268[%269 : i64] : vector<4xf32> %271 = llvm.extractvalue %267[1] : !llvm.array<4 x vector<4xf32>> %272 = llvm.mlir.constant(3 : i64) : i64 %273 = llvm.insertelement %270, %271[%272 : i64] : vector<4xf32> %274 = llvm.insertvalue %273, %267[1] : !llvm.array<4 x vector<4xf32>> %275 = llvm.extractvalue %176[3] : !llvm.array<4 x vector<4xf32>> %276 = llvm.mlir.constant(2 : i64) : i64 %277 = llvm.extractelement %275[%276 : i64] : vector<4xf32> %278 = llvm.extractvalue %274[2] : !llvm.array<4 x vector<4xf32>> %279 = llvm.mlir.constant(3 : i64) : i64 %280 = llvm.insertelement %277, %278[%279 : i64] : vector<4xf32> %281 = llvm.insertvalue %280, %274[2] : !llvm.array<4 x vector<4xf32>> %282 = llvm.extractvalue %176[3] : !llvm.array<4 x vector<4xf32>> %283 = llvm.mlir.constant(3 : i64) : i64 %284 = llvm.extractelement %282[%283 : i64] : vector<4xf32> %285 = llvm.extractvalue %281[3] : !llvm.array<4 x vector<4xf32>> %286 = llvm.mlir.constant(3 : i64) : i64 %287 = llvm.insertelement %284, %285[%286 : i64] : vector<4xf32> %288 = llvm.insertvalue %287, %281[3] : !llvm.array<4 x vector<4xf32>> %289 = llvm.extractvalue %288[0] : !llvm.array<4 x vector<4xf32>> %290 = llvm.fadd %289, %91 : vector<4xf32> %291 = llvm.extractvalue %288[1] : !llvm.array<4 x vector<4xf32>> %292 = llvm.fadd %291, %290 : vector<4xf32> %293 = llvm.extractvalue %288[2] : !llvm.array<4 x vector<4xf32>> %294 = llvm.fadd %293, %292 : vector<4xf32> %295 = llvm.extractvalue %288[3] : !llvm.array<4 x vector<4xf32>> %296 = llvm.fadd %295, %294 : vector<4xf32> %297 = llvm.add %90, %9 : i64 llvm.br ^bb2(%297, %296 : i64, vector<4xf32>) ^bb4: // pred: ^bb2 %298 = llvm.extractvalue %67[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %299 = llvm.getelementptr %298[%85] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %300 = llvm.bitcast %299 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> llvm.store %91, %300 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> llvm.br ^bb12 ^bb5: // pred: ^bb0 %301 = nvvm.read.ptx.sreg.tid.x : i32 %302 = llvm.sext %301 : i32 to i64 %303 = llvm.icmp "sle" %79, %11 : i64 %304 = llvm.sub %11, %79 : i64 %305 = llvm.sub %79, %7 : i64 %306 = llvm.select %303, %304, %305 : i1, i64 %307 = llvm.sdiv %306, %4 : i64 %308 = llvm.sub %11, %307 : i64 %309 = llvm.add %307, %7 : i64 %310 = llvm.select %303, %308, %309 : i1, i64 %311 = llvm.mul %302, %310 : i64 %312 = llvm.mul %311, %3 : i64 %313 = llvm.add %312, %79 : i64 %314 = llvm.icmp "slt" %313, %310 : i64 %315 = llvm.select %314, %313, %310 : i1, i64 %316 = llvm.icmp "slt" %315, %11 : i64 %317 = llvm.select %316, %11, %315 : i1, i64 %318 = llvm.mul %75, %10 : i64 %319 = llvm.add %311, %318 : i64 llvm.br ^bb6(%11 : i64) ^bb6(%320: i64): // 2 preds: ^bb5, ^bb11 %321 = llvm.icmp "slt" %320, %8 : i64 llvm.cond_br %321, ^bb7(%11 : i64), ^bb12 ^bb7(%322: i64): // 2 preds: ^bb6, ^bb10 %323 = llvm.icmp "slt" %322, %317 : i64 llvm.cond_br %323, ^bb8(%11 : i64), ^bb11 ^bb8(%324: i64): // 2 preds: ^bb7, ^bb9 %325 = llvm.icmp "slt" %324, %9 : i64 llvm.cond_br %325, ^bb9, ^bb10 ^bb9: // pred: ^bb8 %326 = llvm.add %319, %322 : i64 %327 = llvm.add %320, %324 : i64 %328 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %329 = llvm.mlir.constant(100 : index) : i64 %330 = llvm.mul %326, %329 : i64 %331 = llvm.add %330, %327 : i64 %332 = llvm.getelementptr 
%328[%331] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %333 = llvm.load %332 : !llvm.ptr<f32> %334 = llvm.extractvalue %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> %335 = llvm.mlir.constant(100 : index) : i64 %336 = llvm.mul %326, %335 : i64 %337 = llvm.add %336, %327 : i64 %338 = llvm.getelementptr %334[%337] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %339 = llvm.load %338 : !llvm.ptr<f32> %340 = llvm.extractvalue %67[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %341 = llvm.getelementptr %340[%326] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> %342 = llvm.load %341 : !llvm.ptr<f32> %343 = llvm.fadd %333, %339 : f32 %344 = llvm.fadd %343, %342 : f32 %345 = llvm.extractvalue %67[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> %346 = llvm.getelementptr %345[%326] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> llvm.store %344, %346 : !llvm.ptr<f32> %347 = llvm.add %324, %7 : i64 llvm.br ^bb8(%347 : i64) ^bb10: // pred: ^bb8 %348 = llvm.add %322, %7 : i64 llvm.br ^bb7(%348 : i64) ^bb11: // pred: ^bb7 %349 = llvm.add %320, %9 : i64 llvm.br ^bb6(%349 : i64) ^bb12: // 2 preds: ^bb4, ^bb6 llvm.return} createConvertToHALPass createFixupLegacySyncPass addCleanupPatterns createLinkExecutablesPass createResolveExportOrdinalsPass createMaterializeResourceCachesPass createInlineDeviceSwitchesPass createMemoizeDeviceQueriesPass addCleanupPatterns createElideRedundantCommandsPass mlir::createLowerAffinePass mlir::createConvertSCFToCFPass IREE::Util::createCombineInitializersPass addCleanupPatterns createSerializeExecutablesPass mlir::createSymbolDCEPass","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"IREE编译流程解析(五)","slug":"IREE编译流程5","date":"2023-02-13T13:57:20.000Z","updated":"2023-02-17T11:32:02.532Z","comments":true,"path":"2023/02/13/IREE编译流程5/","link":"","permalink":"https://hjchen2.github.io/2023/02/13/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B5/","excerpt":"IREE::Stream::StreamTransformPassPipeline 的主要作用是将program转换到stream dialect,优化变量编码方式,划分调度子图,生成异步调度策略,并实现内存规划策略。","text":"IREE::Stream::StreamTransformPassPipeline 的主要作用是将program转换到stream dialect,优化变量编码方式,划分调度子图,生成异步调度策略,并实现内存规划策略。 buildStreamTensorPassPipeline IREE::Stream::createVerifyInputPass 检查program的合法性。 IREE::Stream::createOutlineConstantsPass 将module内部的dense constant转换成global dense constant。 1234567891011func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32> %c10 = arith.constant 10 : index %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> %1 = flow.tensor.reshape %0 : tensor<1x10xf32> -> tensor<10xf32> %2 = flow.tensor.empty : tensor<10xf32> %3 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1, %cst, %2) : (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -> %2 %4 = flow.tensor.reshape %3 : tensor<10xf32> -> tensor<1x10xf32> %5 = hal.tensor.export %4 : tensor<1x10xf32> -> !hal.buffer_view return %5 : !hal.buffer_view} 转换成, 123456789101112util.global private @_constant {noinline} = 
dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %_constant = util.global.load @_constant : tensor<10xf32> %c10 = arith.constant 10 : index %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> %1 = flow.tensor.reshape %0 : tensor<1x10xf32> -> tensor<10xf32> %2 = flow.tensor.empty : tensor<10xf32> %3 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1, %_constant, %2) : (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -> %2 %4 = flow.tensor.reshape %3 : tensor<10xf32> -> tensor<1x10xf32> %5 = hal.tensor.export %4 : tensor<1x10xf32> -> !hal.buffer_view return %5 : !hal.buffer_view} addCleanupPatterns IREE::Stream::createConvertToStreamPass 将IREE::Util、IREE::Flow、IREE::HAL以及std dialect转换到IREE::Stream dialect。 12345678910111213141516171819202122232425262728293031323334module { util.global private @_constant {noinline} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32> flow.executable private @test_dispatch_0 { flow.executable.export public @test_dispatch_0_generic_10 workgroups(%arg0: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0 flow.return %x, %y, %z : index, index, index } builtin.module { func.func @test_dispatch_0_generic_10(%arg0: !flow.dispatch.tensor<readonly:tensor<10xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<10xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<10xf32>>) { %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32> %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32> %2 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xf32>> -> tensor<10xf32> %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<10xf32>, tensor<10xf32>) outs(%2 : tensor<10xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %4 = arith.addf %in, %in_0 : f32 linalg.yield %4 : f32 } -> tensor<10xf32> flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:tensor<10xf32>> return } } } func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c10 = arith.constant 10 : index %_constant = util.global.load @_constant : tensor<10xf32> %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> %1 = flow.tensor.reshape %0 : tensor<1x10xf32> -> tensor<10xf32> %2 = flow.tensor.empty : tensor<10xf32> %3 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1, %_constant, %2) : (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -> %2 %4 = flow.tensor.reshape %3 : tensor<10xf32> -> tensor<1x10xf32> %5 = hal.tensor.export %4 : tensor<1x10xf32> -> !hal.buffer_view return %5 : !hal.buffer_view }} 转换为 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960module { util.global private @_constant : !stream.resource<constant> util.global private @_constant__size : index 
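// 注:原来带初始值的 tensor 类型 util.global 被拆分为 resource 和 size 两个 global,初始值改由下面的 util.initializer 在运行期写入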
util.initializer { %cst = stream.tensor.constant : tensor<10xf32> in !stream.resource<constant> = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32> %0 = stream.resource.size %cst : !stream.resource<constant> util.global.store %cst, @_constant : !stream.resource<constant> util.global.store %0, @_constant__size : index util.initializer.return } stream.executable private @test_dispatch_0 { stream.executable.export public @test_dispatch_0_generic_10 workgroups(%arg0: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @test_dispatch_0_generic_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xf32>> %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xf32>> %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<10xf32>> %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32> %5 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xf32>> -> tensor<10xf32> %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<10xf32>, tensor<10xf32>) outs(%5 : tensor<10xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %7 = arith.addf %in, %in_0 : f32 linalg.yield %7 : f32 } -> tensor<10xf32> flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:tensor<10xf32>> return } } } func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c10 = arith.constant 10 : index %_constant = util.global.load @_constant : !stream.resource<constant> %_constant__size = util.global.load @_constant__size : index %0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size} %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c1 = arith.constant 1 : index %c10_0 = arith.constant 10 : index hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c10_0]) type(%c553648160_i32) encoding(%c1_i32) %1 = stream.tensor.sizeof tensor<1x10xf32> : index %2 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> in !stream.resource<external>{%1} %3 = stream.async.transfer %2 : !stream.resource<external>{%1} -> !stream.resource<*>{%1} %4 = stream.tensor.sizeof tensor<10xf32> : index %5 = stream.tensor.clone %3 : tensor<1x10xf32> in !stream.resource<*>{%1} -> tensor<10xf32> in !stream.resource<*>{%4} %6 = stream.tensor.sizeof tensor<10xf32> : index %empty = stream.tensor.empty : tensor<10xf32> in !stream.resource<*>{%6} %c0 = arith.constant 0 : index %7 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%5[%c0 to %4 for %4], %0[%c0 to %_constant__size for %_constant__size], %empty[%c0 to %6 for %6]) : 
(!stream.resource<*>{%4}, !stream.resource<*>{%_constant__size}, !stream.resource<*>{%6}) -> %empty{%6} %8 = stream.tensor.sizeof tensor<1x10xf32> : index %9 = stream.tensor.clone %7 : tensor<10xf32> in !stream.resource<*>{%6} -> tensor<1x10xf32> in !stream.resource<*>{%8} %10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8} %11 = stream.tensor.export %10 : tensor<1x10xf32> in !stream.resource<external>{%8} -> !hal.buffer_view return %11 : !hal.buffer_view }} 可以看到除了flow.executable,module中tensor type都被转换成stream.resource和index,但hal.buffer_view type仍然被保留。初始值为tensor的util.global constant被转换为不带初始值的 stream.resource和index,同时生成了一个util.initializer对 stream.resource和index进行初始化。 util.global.load被转换成util.global.load + stream.async.transfer,hal.tensor.import被转换成stream.tensor.import + stream.async.transfer,hal.tensor.export被转换为stream.async.transfer + stream.tensor.export,flow.tensor.reshape被转换成stream.tensor.clone,flow.executable转换为stream.executable,内部的flow.executable.export转换为stream.executable.export ,内部的func op的argument由flow.dispatch.tensor转换为stream.binding。 IREE::Stream::createVerifyLoweringToTensorsPass 检查program的合法性。 addCleanupPatterns IREE::Util::createCombineInitializersPass 合并所有的util.initializer ops。 buildStreamAsyncPassPipeline IREE::Stream::createEncodeHostTensorsPass 主要作用是将tensor的元素位宽(bit)扩充为2的幂大小,并按字节对齐。其中i1~i7转换为i8(1 byte),i9~i15转换为i16 (2 bytes),i17~i31转换为i32 (4 bytes),i33~i63转换为i64(8 bytes)。 1234567util.initializer { %cst = stream.tensor.constant : tensor<10xi4> in !stream.resource<constant> = dense<[0, 1, 2, 3, 4, 5, 6, 7, -8, -7]> : tensor<10xi4> %0 = stream.resource.size %cst : !stream.resource<constant> util.global.store %cst, @_constant : !stream.resource<constant> util.global.store %0, @_constant__size : index util.initializer.return} 转换为 1234567util.initializer { %c10 = arith.constant 10 : index %cst = stream.async.constant : !stream.resource<constant>{%c10} = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi8> util.global.store %cst, @_constant : !stream.resource<constant> util.global.store %c10, @_constant__size : index util.initializer.return} %cst的类型从i4转成了i8,此外stream.tensor.constant转换成了stream.async.constant,%0 = stream.resource.size %cst : !stream.resource<constant>直接被替换成了常量%c10。 IREE::Stream::createEncodeDeviceTensorsPass 和createEncodeHostTensorsPass作用一样,区别是createEncodeDeviceTensorsPass作用的是stream.executable中的op。 123456789101112131415161718builtin.module { func.func @test_dispatch_0_generic_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi4>> %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi4>> %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<10xi4>> %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi4>> -> tensor<10xi4> %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi4>> -> tensor<10xi4> %5 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xi4>> -> tensor<10xi4> %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<10xi4>, tensor<10xi4>) outs(%5 : tensor<10xi4>) 
{ ^bb0(%in: i4, %in_0: i4, %out: i4): %7 = arith.addi %in, %in_0 : i4 linalg.yield %7 : i4 } -> tensor<10xi4> flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xi4> -> !flow.dispatch.tensor<readwrite:tensor<10xi4>> return }} 转换为, 12345678910111213141516171819202122builtin.module { func.func @test_dispatch_0_generic_10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi8>> %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10xi8>> %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<10xi8>> %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi8>> -> tensor<10xi8> %4 = arith.trunci %3 : tensor<10xi8> to tensor<10xi4> %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xi8>> -> tensor<10xi8> %6 = arith.trunci %5 : tensor<10xi8> to tensor<10xi4> %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<10xi8>> -> tensor<10xi8> %8 = arith.trunci %7 : tensor<10xi8> to tensor<10xi4> %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %6 : tensor<10xi4>, tensor<10xi4>) outs(%8 : tensor<10xi4>) { ^bb0(%in: i4, %in_0: i4, %out: i4): %11 = arith.addi %in, %in_0 : i4 linalg.yield %11 : i4 } -> tensor<10xi4> %10 = arith.extui %9 : tensor<10xi4> to tensor<10xi8> flow.dispatch.tensor.store %10, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xi8> -> !flow.dispatch.tensor<readwrite:tensor<10xi8>> return }} 可以看到stream.binding.subspan的result type从i4转换成了i8,并且在flow.dispatch.tensor.load之后插入了一个arith.trunci,将i8截断为i4,进而参与linalg.generic中的计算。 IREE::Stream::createMaterializeBuiltinsPass addCleanupPatterns IREE::Stream::createMaterializeCopyOnWritePass 写入时插入一次拷贝,以更有效地支持inplace更新,并且确保正确的执行语义。 IREE::Stream::createElideAsyncCopiesPass 消除MaterializeCopyOnWritePass中插入的不必要的拷贝。 mlir::createCanonicalizerPass IREE::Stream::createEmplaceAllocationsPass 尝试消除stream.async.dispatch后的stream.async.update op。当stream.async.dispatch的结果没有绑定一个value时,就可以把stream.async.update的target绑定到stream.async.dispatch的结果,使得stream.async.dispatch直接把计算结果更新到target。 IREE::Stream::createRefineUsagePass 确定每个stream.resource的生命期,推导stream.resource的类型。stream.resource类型包括: Unknown: stream.resource<*> External:stream.resource<external> 由外部程序管理的内存 Staging:stream.resource<staging> 用于上传/下载的暂存缓冲区 Transient:stream.resource<transient> 跨stream的一段临时值 Variable:stream.resource<variable> 跨stream的一段持续值 Constant:stream.resource<constant> 整个程序中持续存在的立即值(常量)。 除此之外还消除了冗余的stream.async.transfer。 1234567891011121314func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c40 = arith.constant 40 : index %c0 = arith.constant 0 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10xf32> in !stream.resource<external>{%c40} %1 = stream.async.transfer %0 : !stream.resource<external>{%c40} -> !stream.resource<*>{%c40} %2 
= stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%1[%c0 to %c40 for %c40]) : (!stream.resource<*>{%c40}) -> !stream.resource<*>{%c40} %3 = stream.async.transfer %2 : !stream.resource<*>{%c40} -> !stream.resource<external>{%c40} %4 = stream.tensor.export %3 : tensor<10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %4 : !hal.buffer_view} 转换为 123456789101112func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c40 = arith.constant 40 : index %c0 = arith.constant 0 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10xf32> in !stream.resource<external>{%c40} %1 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%0[%c0 to %c40 for %c40]) : (!stream.resource<external>{%c40}) -> !stream.resource<external>{%c40} %2 = stream.tensor.export %1 : tensor<10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %2 : !hal.buffer_view} 可以看到!stream.resource<*>{ %c40}被推导为!stream.resource<external>{ %c40},并且有两处stream.async.transfer被删除了。 addCleanupPatterns IREE::Stream::createScheduleExecutionPass 根据启发式算法将每个callable(包括util.initializer)划分成多个part进行调度,每个part独立构成一个stream.async.execute,并且每个stream.async.execute后面都跟了一个stream.timepoint.await操作用于同步执行结果。 123456789101112131415func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c40 = arith.constant 40 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %_constant = util.global.load @_constant : !stream.resource<constant> hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c10]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> in !stream.resource<external>{%c40} %1 = stream.async.alloca : !stream.resource<external>{%c40} %2 = stream.async.dispatch @test_dispatch_0::@test_dispatch_0_generic_10[%c10](%0[%c0 to %c40 for %c40], %_constant[%c0 to %c40 for %c40], %1[%c0 to %c40 for %c40]) : (!stream.resource<external>{%c40}, !stream.resource<constant>{%c40}, !stream.resource<external>{%c40}) -> %1{%c40} %3 = stream.tensor.export %2 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %3 : !hal.buffer_view} 转换成, 12345678910111213141516171819func.func @test(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c40 = arith.constant 40 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %_constant = util.global.load @_constant : !stream.resource<constant> hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c10]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x10xf32> in !stream.resource<external>{%c40} %results, %result_timepoint = stream.async.execute with(%0 as %arg1: !stream.resource<external>{%c40}, %_constant as %arg2: !stream.resource<constant>{%c40}) -> !stream.resource<external>{%c40} { %3 = stream.async.alloca : !stream.resource<external>{%c40} %4 = stream.async.dispatch 
@test_dispatch_0::@test_dispatch_0_generic_10[%c10](%arg1[%c0 to %c40 for %c40], %arg2[%c0 to %c40 for %c40], %3[%c0 to %c40 for %c40]) : (!stream.resource<external>{%c40}, !stream.resource<constant>{%c40}, !stream.resource<external>{%c40}) -> %3{%c40} stream.yield %4 : !stream.resource<external>{%c40} } => !stream.timepoint %1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c40} %2 = stream.tensor.export %1 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %2 : !hal.buffer_view} 注意:该示例中只有一个part。 IREE::Stream::createScheduleConcurrencyPass 继续将stream.async.execute划分为多个并行调度区,每个并行调度区构成一个stream.async.concurrent 。 IREE::Stream::createPropagateTimepointsPass 给stream.resource 绑定一个 stream.timepoint,在代码中用stream.resource + stream.timepoint 的pair 替换原来的stream.resource,并在需要的地方插入await。 util.global 1util.global private @_constant : !stream.resource<constant> 转换成 12util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepointutil.global private @_constant : !stream.resource<constant> util.global.load 1%_constant = util.global.load @_constant : !stream.resource<constant> 转换成 123%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint%_constant = util.global.load @_constant : !stream.resource<constant>%0 = stream.timepoint.await %_constant__timepoint => %_constant : !stream.resource<constant>{%c40} util.global.store 1util.global.store %0, @_constant : !stream.resource<constant> 转换成 12util.global.store %result_timepoint, @_constant__timepoint : !stream.timepointutil.global.store %results, @_constant : !stream.resource<constant> func.func 123func.func @foo(%0: !stream.resource) { ...} 转换成 1234func.func @foo(%t: !stream.timepoint, %0: !stream.resource) { %1 = stream.timepoint.await %t, %0 ...} call 由于func内部已经插入了await,因此call之前的冗余await可以删除,call之后需要再插入一个func返回值的await。 12%1 = stream.timepoint.await %t, %0%r = call @foo(%1) 转换成 12%rt, %r = call @foo(%t, %0)stream.timepoint.await %rt, %t return 12%1 = stream.timepoint.await %t, %0return %1 转换成 1return %t, %0 branch 将参数的await挪到branch里面。 12345%1 = stream.timepoint.await %t, %0br ^bb1(%1)^bb1(%b): ... 转换成 123br ^bb1(%t, %0)^bb1(%a, %b): %1 = stream.timepoint.await %a, %b stream.async.execute 为每个未绑定stream.timepoint的输入参数绑定一个stream.timepoint,并在stream.async.execute之前计算参数的最大timepoint,stream.async.execute 则await这个最大timepoint。 123%results, %result_timepoint = stream.async.execute with(%0 as %arg1: !stream.resource<external>{%c40}, %_constant as %arg2: !stream.resource<constant>{%c40}) -> !stream.resource<external>{%c40} { ...} 转换成 1234%3 = stream.timepoint.join max(%2, %_constant__timepoint) => !stream.timepoint%results, %result_timepoint = stream.async.execute await(%3) => with(%1 as %arg1: !stream.resource<external>{%c40}, %_constant as %arg2: !stream.resource<constant>{%c40}) -> !stream.resource<external>{%c40} { ... 
} addCleanupPatterns IREE::Stream::createVerifyLoweringToAsyncPass 验证LoweringToAsyncPass阶段program的合法性。 buildStreamCmdPassPipeline IREE::Stream::createScheduleAllocationPas 首先将所有常量op聚合成一个stream.resource.constants,并移出该region,stream.resource.constants的结果会被append到该region的输入参数中(原本直接yield的常量除外)。 1234%results, %result_timepoint = stream.async.execute with() -> !stream.resource<constant>{%c40} { %cst = stream.async.constant : !stream.resource<constant>{%c40} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32> stream.yield %cst : !stream.resource<constant>{%c40}} => !stream.timepoint 转换成 123456%results, %result_timepoint = stream.resource.constants : !stream.resource<constant>{%c40} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32> => !stream.timepoint%0 = stream.cmd.execute with() {} => !stream.timepoint%1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint 分析stream.async.execute region中resource的类型和他们之间的alias关系,按照resource的类型统一分配空间。对于没有被Tied到输入(即非inplace)的results,会统一在region外面由stream.resource.alloc申请一段external空间,region再通过Tied的方式消费alloc的结果。对于中间临时的resource,经过stream.resource.pack计算需要分配的空间大小后统一由stream.resource.alloca申请一段transient空间,并会在region后面插入stream.resource.dealloca释放申请的临时空间。 12345678910111213141516171819202122232425func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c8 = arith.constant 8 : index %c40 = arith.constant 40 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c2 = arith.constant 2 : index hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8} // stream.async.execute %results, %result_timepoint = stream.async.execute with(%0 as %arg1: !stream.resource<external>{%c8}) -> !stream.resource<external>{%c40} { %3 = stream.async.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10](%arg1[%c0 to %c8 for %c8]) : (!stream.resource<external>{%c8}) -> !stream.resource<transient>{%c40} %4 = stream.async.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1](%3[%c0 to %c40 for %c40]) : (!stream.resource<transient>{%c40}) -> !stream.resource<transient>{%c4} %5 = stream.async.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10](%3[%c0 to %c40 for %c40], %4[%c0 to %c4 for %c4]) : (!stream.resource<transient>{%c40}, !stream.resource<transient>{%c4}) -> !stream.resource<transient>{%c40} %6 = stream.async.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1](%5[%c0 to %c40 for %c40]) : (!stream.resource<transient>{%c40}) -> !stream.resource<transient>{%c4} %7 = stream.async.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10](%5[%c0 to %c40 for %c40], %6[%c0 to %c4 for %c4]) : (!stream.resource<transient>{%c40}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c40} stream.yield %7 : !stream.resource<external>{%c40} } => !stream.timepoint %1 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c40} %2 = stream.tensor.export %1 : tensor<1x10xf32> in 
!stream.resource<external>{%c40} -> !hal.buffer_view return %2 : !hal.buffer_view} 转换成, 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c8 = arith.constant 8 : index %c40 = arith.constant 40 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c2 = arith.constant 2 : index hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8} %c0_0 = arith.constant 0 : index // 申请输出resource的空间 %1 = stream.resource.alloc uninitialized : !stream.resource<external>{%c40} // 计算临时resource所需要的空间大小 %2:5 = stream.resource.pack slices({ [0, 2] = %c40, // [0, 2]是某个resource的lifetime,%40是resource size [1, 2] = %c4, [2, 4] = %c40, [3, 4] = %c4 }) : index // 申请临时resource的空间 %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<transient>{%2#0} => !stream.timepoint %3 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%2#0}) { stream.cmd.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10] { ro %arg1[%c0 for %c8] : !stream.resource<external>{%c8}, wo %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1] { ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0}, wo %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] { ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0}, ro %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0}, wo %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1] { ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0}, wo %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10] { ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0}, ro %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0}, wo %arg2[%c0_0 for %c40] : !stream.resource<external>{%c40} } } => !stream.timepoint // 释放申请的临时空间 %4 = stream.resource.dealloca await(%3) => %result : !stream.resource<transient>{%2#0} => !stream.timepoint %5 = stream.timepoint.join max(%4, %3) => !stream.timepoint %6 = stream.timepoint.await %5 => %1 : !stream.resource<external>{%c40} %7 = stream.tensor.export %6 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %7 : !hal.buffer_view} IREE::Stream::createPackConstantsPass 将stream.resource.constants的结果根据lifetime类型分成Constant和Variable两种,每一种都替换成一个util.buffer.constant 。 123456789101112util.initializer { %c40 = arith.constant 40 : index %results, %result_timepoint = stream.resource.constants : !stream.resource<constant>{%c40} = dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32> => !stream.timepoint %0 = stream.cmd.execute with() { } => 
!stream.timepoint %1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint util.global.store %results, @_constant : !stream.resource<constant> util.global.store %1, @_constant__timepoint : !stream.timepoint util.initializer.return} 转换成, 1234567891011121314151617181920212223242526272829303132util.initializer { %c40 = arith.constant 40 : index %buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<64xi8, [ dense<[0.000000e+00, 0.00999999977, 2.000000e-02, 3.000000e-02, 4.000000e-02, 5.000000e-02, 6.000000e-02, 7.000000e-02, 8.000000e-02, 9.000000e-02]> : tensor<10xf32>, dense<0> : vector<24xi8>, // 填充的无用数据]> %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index // 尝试将buffer映射为target (!stream.resource<constant>) %did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c64} %0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) { // 如果可以映射,则直接返回映射的结果(!stream.resource<constant>) %4 = stream.timepoint.immediate => !stream.timepoint scf.yield %result, %4 : !stream.resource<constant>, !stream.timepoint } else { // 如果不能映射,需要先将buffer映射为缓冲区(stage),然后申请一段新的空间并从缓冲区拷贝数据(copy)。 // 如果lifetime类型是Variable,则不需要try_map,直接走该分支(stage + copy)的实现。 %4 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c64} %5 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c64} %6 = stream.cmd.execute with(%4 as %arg0: !stream.resource<staging>{%c64}, %5 as %arg1: !stream.resource<constant>{%c64}) { stream.cmd.copy %arg0[%c0], %arg1[%c0], %c64 : !stream.resource<staging>{%c64} -> !stream.resource<constant>{%c64} } => !stream.timepoint scf.yield %5, %6 : !stream.resource<constant>, !stream.timepoint } %1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c64} -> !stream.resource<constant>{%c40} %2 = stream.cmd.execute with() { } => !stream.timepoint %3 = stream.timepoint.join max(%0#1, %2) => !stream.timepoint util.global.store %1, @_constant : !stream.resource<constant> util.global.store %3, @_constant__timepoint : !stream.timepoint util.initializer.return} IREE::Stream::createPackAllocationsPass 将包含多个resource的stream.resource.alloc 转换成 stream.resource.pack + stream.resource.alloc,并通过stream.resource.subview 获取每一个resource。 IREE::Stream::createLayoutSlicesPass 将stream.resource.pack转化为具体的内存复用算法计算过程。 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c8 = arith.constant 8 : index %c40 = arith.constant 40 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c2 = arith.constant 2 : index hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8} %c0_0 = arith.constant 0 : index // 申请输出resource的空间 %1 = stream.resource.alloc uninitialized : !stream.resource<external>{%c40} // 计算临时resource所需要的空间大小 %2:5 = stream.resource.pack slices({ [0, 2] = %c40, // [0, 2]是某个resource的lifetime,%40是resource size [1, 2] = %c4, [2, 4] = %c40, [3, 4] = %c4 }) : index // 申请临时resource的空间 %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<transient>{%2#0} 
=> !stream.timepoint %3 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%2#0}) { stream.cmd.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10] { ro %arg1[%c0 for %c8] : !stream.resource<external>{%c8}, wo %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1] { ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0}, wo %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] { ro %arg3[%2#1 for %c40] : !stream.resource<transient>{%2#0}, ro %arg3[%2#2 for %c4] : !stream.resource<transient>{%2#0}, wo %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1] { ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0}, wo %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0} } stream.cmd.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10] { ro %arg3[%2#3 for %c40] : !stream.resource<transient>{%2#0}, ro %arg3[%2#4 for %c4] : !stream.resource<transient>{%2#0}, wo %arg2[%c0_0 for %c40] : !stream.resource<external>{%c40} } } => !stream.timepoint // 释放申请的临时空间 %4 = stream.resource.dealloca await(%3) => %result : !stream.resource<transient>{%2#0} => !stream.timepoint %5 = stream.timepoint.join max(%4, %3) => !stream.timepoint %6 = stream.timepoint.await %5 => %1 : !stream.resource<external>{%c40} %7 = stream.tensor.export %6 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %7 : !hal.buffer_view} 转换成, 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c8 = arith.constant 8 : index %c40 = arith.constant 40 : index %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index %c553648160_i32 = arith.constant 553648160 : i32 %c1_i32 = arith.constant 1 : i32 %c2 = arith.constant 2 : index hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c2]) type(%c553648160_i32) encoding(%c1_i32) %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2xf32> in !stream.resource<external>{%c8} %c0_0 = arith.constant 0 : index %1 = stream.resource.alloc uninitialized : !stream.resource<external>{%c40} %c0_1 = arith.constant 0 : index %c64 = arith.constant 64 : index %c64_2 = arith.constant 64 : index %c128 = arith.constant 128 : index %c128_3 = arith.constant 128 : index %c192 = arith.constant 192 : index %c192_4 = arith.constant 192 : index %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<transient>{%c192_4} => !stream.timepoint %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%c192_4}) { stream.cmd.dispatch @predict_dispatch_0::@predict_dispatch_0_matmul_1x10x2[%c1, %c10] { ro %arg1[%c0 for %c8] : !stream.resource<external>{%c8}, wo %arg3[%c0_1 for %c40] : !stream.resource<transient>{%c192_4} } stream.cmd.dispatch @predict_dispatch_1::@predict_dispatch_1_generic_10[%c1] { ro %arg3[%c0_1 for %c40] : !stream.resource<transient>{%c192_4}, 
wo %arg3[%c64_2 for %c4] : !stream.resource<transient>{%c192_4} } stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] { ro %arg3[%c0_1 for %c40] : !stream.resource<transient>{%c192_4}, ro %arg3[%c64_2 for %c4] : !stream.resource<transient>{%c192_4}, wo %arg3[%c128_3 for %c40] : !stream.resource<transient>{%c192_4} } stream.cmd.dispatch @predict_dispatch_3::@predict_dispatch_3_generic_10[%c1] { ro %arg3[%c128_3 for %c40] : !stream.resource<transient>{%c192_4}, wo %arg3[%c0_1 for %c4] : !stream.resource<transient>{%c192_4} } stream.cmd.dispatch @predict_dispatch_4::@predict_dispatch_4_generic_1x10[%c1, %c10] { ro %arg3[%c128_3 for %c40] : !stream.resource<transient>{%c192_4}, ro %arg3[%c0_1 for %c4] : !stream.resource<transient>{%c192_4}, wo %arg2[%c0_0 for %c40] : !stream.resource<external>{%c40} } } => !stream.timepoint %3 = stream.resource.dealloca await(%2) => %result : !stream.resource<transient>{%c192_4} => !stream.timepoint %4 = stream.timepoint.join max(%3, %2) => !stream.timepoint %5 = stream.timepoint.await %4 => %1 : !stream.resource<external>{%c40} %6 = stream.tensor.export %5 : tensor<1x10xf32> in !stream.resource<external>{%c40} -> !hal.buffer_view return %6 : !hal.buffer_view} IREE::Util::createPropagateSubrangesPass 把resource转换成 (resource, size, offset, length)的元组。 util.global 1util.global private @_constant : !stream.resource<constant> 转换成 1234util.global private @_constant : !stream.resource<constant>util.global private @_constant_size : indexutil.global private @_constant_offset : indexutil.global private @_constant_length : index util.global.load 1%0 = util.global.load @foo : !stream.resource 转换成 123456%0 = util.global.load @foo : !stream.resource%s = util.global.load @foo_size : index%o = util.global.load @foo_offset : index%l = util.global.load @foo_length : index%1 = stream.resource.subview %0[%o] : !stream.resource<*>{%s} -> !stream.resource<*>{%l} util.global.store 123%1 = stream.resource.subview %0[%o] : !stream.resource<*>{%s} -> !stream.resource<*>{%l}util.global.store %1, @foo : !stream.resource 转换成 1234util.global.store %0, @foo : !stream.resource // 这里语义是正确的吗???util.global.store %s, @foo_size : indexutil.global.store %o, @foo_offset : indexutil.global.store %l, @foo_length : index func.func 123func.func @foo(%0: !stream.resource) { ...} 转换成 1234func.func @foo(%0: !stream.resource, %sz: index, %o: index, %l: index) { %1 = stream.resource.subview %0[%o] : {%sz} -> {%l} ...} call 12%1 = stream.resource.subview %0[%o] : {%sz} -> {%l}%r = call @foo(%1) 转换成 12%r, %rsz, %ro, %rl = call @foo(%0, %sz, %o, %l)%2 = stream.resource.subview %r[%ro] : {%rsz} -> {%rl} return 12%1 = stream.resource.subview %0[%o] : {%sz} -> {%l}return %1 转换成 1return %0, %sz, %o, %l branch 12345%1 = stream.resource.subview %0[%o] : {%sz} -> {%l}br ^bb1(%1)^bb1(%b): ... 
转换成 1234br ^bb1(%0, %sz, %o, %l) ^bb1(%a, %b, %c, %d): %1 = stream.resource.subview %a[%b] : {%c} -> {%d} cond_branch addCleanupPatterns IREE::Stream::createVerifyLoweringToCmdPass 验证program的合法性。 buildStreamOptimizationPassPipeline addCleanupPatterns mlir::createConvertSCFToCFPass 将structured control flow算子转换成更低层基础块形式的control flow算子。 123456789101112func.func @test(%pred: i32, %arg1: tensor<2x10xf32>, %arg2: tensor<2x10xf32>) -> tensor<2x10xf32> { %c0 = arith.constant 0 : i32 %0 = arith.cmpi sgt, %pred, %c0 : i32 %1 = scf.if %0 -> (tensor<2x10xf32>) { %2 = mhlo.add %arg1, %arg2 : tensor<2x10xf32> scf.yield %2 : tensor<2x10xf32> } else { %2 = mhlo.subtract %arg1, %arg2 : tensor<2x10xf32> scf.yield %2 : tensor<2x10xf32> } return %1 : tensor<2x10xf32>} 转换成 12345678910111213func.func @test(%pred: i32, %arg1: tensor<2x10xf32>, %arg2: tensor<2x10xf32>) -> tensor<2x10xf32> { %c0 = arith.constant 0 : i32 %0 = arith.cmpi sgt, %pred, %c0 : i32 cf.cond_br %0, ^bb1, ^bb2 ^bb1: %2 = mhlo.add %arg1, %arg2 : tensor<2x10xf32> cf.br ^bb3(%2 : tensor<2x10xf32>) ^bb2: %3 = mhlo.subtract %arg1, %arg2 : tensor<2x10xf32> cf.br ^bb3(%3 : tensor<2x10xf32>) ^bb3(%4: tensor<2x10xf32>): return %4 : tensor<2x10xf32>} addCleanupPatterns IREE::Stream::createElideTimepointsPass 消除已经确信到达的等待。比如 123%timepoint0 = ...%timepoint1 = ... await(%timepoint0)%timepoint2 = stream.timepoint.join max(%timepoint0, %timepoint1) timepoint1到达时timepoint0一定已经达到过,因此可以转换成, 123%timepoint0 = ...%timepoint1 = ... await(%timepoint0)%timepoint2 = stream.timepoint.join max(%timepoint1) canonicalization之后最终是 123%timepoint0 = ...%timepoint1 = ... await(%timepoint0)%timepoint2 = %timepoint1 IREE::Util::createFixedPointIteratorPass 该pass触发重复执行一个pass pipeline,直到达到固定迭代次数或最大迭代次数。这里的pipeline包括前面的addCleanupPatterns和createElideTimepointsPass两个子pass。 IREE::Stream::createFuseDispatchBindingsPass 根据stream.cmd.dispatch 的resource关系合并dispatch executable的bindings,比如stream.cmd.dispatch 两个resource是同一个地址的不同range,则可以计算每个resource在base地址上的偏移,并将这两个resource合并成一个binding,在dispatch executable中根据偏移来截取每个被合并的binding。该操作默认只合并read only的resource。 1234567891011121314151617181920212223242526272829303132333435363738stream.executable private @predict_dispatch_2 { stream.executable.export public @predict_dispatch_2_generic_1x10 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { %c0 = arith.constant 0 : index %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1x10xf32>> %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>> %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32> %4 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> %5 = tensor.empty() : tensor<1x10xf32> %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<1x10xf32>, tensor<f32>) outs(%5 : tensor<1x10xf32>) { ^bb0(%in: f32, %in_0: f32, 
%out: f32): %7 = arith.subf %in, %in_0 : f32 %8 = math.exp %7 : f32 linalg.yield %8 : f32 } -> tensor<1x10xf32> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>> return } }}func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { ... %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%c192}) { ... stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10] { ro %arg3[%c0 for %c40] : !stream.resource<transient>{%c192}, ro %arg3[%c64 for %c4] : !stream.resource<transient>{%c192}, wo %arg3[%c128 for %c40] : !stream.resource<transient>{%c192} } ... }} 转换成 12345678910111213141516171819202122232425262728293031323334353637383940stream.executable private @predict_dispatch_2 { stream.executable.export public @predict_dispatch_2_generic_1x10 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index) { %c0 = arith.constant 0 : index %0 = arith.addi %c0, %arg2 : index %1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1x10xf32>> %2 = arith.addi %c0, %arg3 : index %3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>> %4 = arith.addi %c0, %arg4 : index %5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>> %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32> %7 = flow.dispatch.tensor.load %3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> %8 = tensor.empty() : tensor<1x10xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6, %7 : tensor<1x10xf32>, tensor<f32>) outs(%8 : tensor<1x10xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %10 = arith.subf %in, %in_0 : f32 %11 = math.exp %10 : f32 linalg.yield %11 : f32 } -> tensor<1x10xf32> flow.dispatch.tensor.store %9, %5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x10xf32>> return } }}func.func @predict(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { ... %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg1: !stream.resource<external>{%c8}, %1 as %arg2: !stream.resource<external>{%c40}, %result as %arg3: !stream.resource<transient>{%c192}) { ... stream.cmd.dispatch @predict_dispatch_2::@predict_dispatch_2_generic_1x10[%c1, %c10](%c0, %c64, %c128 : index, index, index) { ro %arg3[%c0_0 for %c192] : !stream.resource<transient>{%c192}, wo %arg3[%c0_0 for %c192] : !stream.resource<transient>{%c192} } ... 
}} 可以看到stream.cmd.dispatch @predict_dispatch_2的resource被合并为2个,predict_dispatch_2_generic_1x10 dispatch executable参数中的binding也减少为2个,但增加了3个表示offset的index,被合并的binding根据offset来截取。 IREE::Stream::createPackDispatchOperandsPass 将dispatch executable参数中的标量/index类型转换成i32或i64类型。 123func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index) { ...} 转换成 123func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32, %arg4: i32) { ...} mlir::createCSEPass IREE::Stream::createFoldUniformOperandsPass 折叠dispatch executable的所有调用中相同的参数。 123stream.cmd.dispatch @foo(%c1, %c100 : index, index)stream.cmd.dispatch @foo(%c1, %c101 : index, index)stream.cmd.dispatch @foo2(%c1, %c101 : index, index) 转换成 123stream.cmd.dispatch @foo(%c100 : index)stream.cmd.dispatch @foo(%c101 : index)stream.cmd.dispatch @foo2() @foo内联了%c1,@foo2内联了%c1和%c101。 IREE::Stream::createAnnotateDispatchArgumentsPass 给dispatch executable的参数添加potential value和alignment信息。 123func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding, %arg1: !stream.binding) { ...} 转换为 123func.func @predict_dispatch_2_generic_1x10(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) { ...} IREE::Stream::createMemoizeChannelsPass 找出所有stream.channel.default ops,为每一个stream.channel.default op创建一个全局缓冲区,同时在初始化时创建对应的channel,并将channel结果写入全局缓冲区,最后将该stream.channel.default op替换为全局缓冲区的util.global.load op。 addCleanupPatterns mlir::createSymbolDCEPass","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"IREE编译流程解析(四)","slug":"IREE编译流程4","date":"2023-01-04T13:15:20.000Z","updated":"2023-02-17T11:57:53.436Z","comments":true,"path":"2023/01/04/IREE编译流程4/","link":"","permalink":"https://hjchen2.github.io/2023/01/04/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B4/","excerpt":"IREE Flow::buildFlowTransformPassPipeline主要作用是执行一系列窥孔优化,比如1x1的conv2d转换成matmul、tiling、op fusion等,最终将workload拆分成flow.executable。相关的passes及其作用如下。","text":"IREE Flow::buildFlowTransformPassPipeline主要作用是执行一系列窥孔优化,比如1x1的conv2d转换成matmul、tiling、op fusion等,最终将workload拆分成flow.executable。相关的passes及其作用如下。 IREE::Util::createDemoteF64ToF32Pass 将F64类型窄化为F32。 IREE::Flow::createConvertConv2D1x1ToMatmulPass 将1x1的linalg.conv_2d_nhwc_hwcf转换成linalg.matmul。 123456789101112131415161718// func.func @conv(%input : tensor<1x2x2x3xf32>, %filter: tensor<1x1x3x4xf32>) -> tensor<1x2x2x4xf32> {// %0 = mhlo.convolution(%input, %filter)// dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],// window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]}// {batch_group_count = 1 : i64, feature_group_count = 1 : i64}// : (tensor<1x2x2x3xf32>, tensor<1x1x3x4xf32>) -> tensor<1x2x2x4xf32>// return %0 : tensor<1x2x2x4xf32>// }func.func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2x2x3xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x1x3x4xf32> %2 = linalg.init_tensor [1, 2, 2, 4] : tensor<1x2x2x4xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1x2x2x4xf32>) -> 
tensor<1x2x2x4xf32> %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%0, %1 : tensor<1x2x2x3xf32>, tensor<1x1x3x4xf32>) outs(%3 : tensor<1x2x2x4xf32>) -> tensor<1x2x2x4xf32> %5 = hal.tensor.export %4 : tensor<1x2x2x4xf32> -> !hal.buffer_view return %5 : !hal.buffer_view} 转换成, 1234567891011121314func.func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x2x2x3xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x1x3x4xf32> %2 = linalg.init_tensor [1, 2, 2, 4] : tensor<1x2x2x4xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1x2x2x4xf32>) -> tensor<1x2x2x4xf32> %4 = tensor.collapse_shape %0 [[0, 1, 2], [3]] : tensor<1x2x2x3xf32> into tensor<4x3xf32> %5 = tensor.collapse_shape %1 [[0, 1, 2], [3]] : tensor<1x1x3x4xf32> into tensor<3x4xf32> %6 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x2x2x4xf32> into tensor<4x4xf32> %7 = linalg.matmul ins(%4, %5 : tensor<4x3xf32>, tensor<3x4xf32>) outs(%6 : tensor<4x4xf32>) -> tensor<4x4xf32> %8 = tensor.expand_shape %7 [[0, 1, 2], [3]] : tensor<4x4xf32> into tensor<1x2x2x4xf32> %9 = hal.tensor.export %8 : tensor<1x2x2x4xf32> -> !hal.buffer_view return %9 : !hal.buffer_view} IREE::Flow::createConvertConv2DToImg2ColPass 将conv2d转换成img2col。默认不开启。 123456789101112131415// %0 = mhlo.convolution(%input, %filter)// dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],// window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]}// {batch_group_count = 1 : i64, feature_group_count = 1 : i64}// : (tensor<1x4x4x3xf32>, tensor<2x2x3x4xf32>) -> tensor<1x3x3x4xf32>func.func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x4x4x3xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2x2x3x4xf32> %2 = linalg.init_tensor [1, 3, 3, 4] : tensor<1x3x3x4xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%0, %1 : tensor<1x4x4x3xf32>, tensor<2x2x3x4xf32>) outs(%3 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %5 = hal.tensor.export %4 : tensor<1x3x3x4xf32> -> !hal.buffer_view return %5 : !hal.buffer_view} 转换成, 12345678910111213141516171819func.func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x4x4x3xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2x2x3x4xf32> %2 = linalg.init_tensor [1, 3, 3, 4] : tensor<1x3x3x4xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %4 = linalg.init_tensor [1, 3, 3, 2, 2, 3] : tensor<1x3x3x2x2x3xf32> %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 + d3, d2 + d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<1x4x4x3xf32>) outs(%4 : tensor<1x3x3x2x2x3xf32>) { ^bb0(%arg2: f32, %arg3: f32): linalg.yield %arg2 : f32 } -> tensor<1x3x3x2x2x3xf32> %6 = tensor.collapse_shape %5 [[0, 1, 2], [3, 4, 5]] : 
tensor<1x3x3x2x2x3xf32> into tensor<9x12xf32> %7 = tensor.collapse_shape %1 [[0, 1, 2], [3]] : tensor<2x2x3x4xf32> into tensor<12x4xf32> %8 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x3x3x4xf32> into tensor<9x4xf32> %9 = linalg.matmul ins(%6, %7 : tensor<9x12xf32>, tensor<12x4xf32>) outs(%8 : tensor<9x4xf32>) -> tensor<9x4xf32> %10 = tensor.expand_shape %9 [[0, 1, 2], [3]] : tensor<9x4xf32> into tensor<1x3x3x4xf32> %11 = hal.tensor.export %10 : tensor<1x3x3x4xf32> -> !hal.buffer_view return %11 : !hal.buffer_view} IREE::Flow::createDetachElementwiseFromNamedOpsPass 将buffer = linalg.generic_op + linalg.named_payload_op转换成tmp_buffer = linalg.named_payload_op; buffer = linalg.generic_op + tmp_buffer,主要目的是将上游的generic op和named_payload_op分隔开,使得named_payload_op的结果写到一块新的buffer。 123456789101112131415161718func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x4x4x3xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2x2x3x4xf32> %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<1x3x3x4xf32> %3 = linalg.init_tensor [1, 3, 3, 4] : tensor<1x3x3x4xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x3x3x4xf32>) outs(%4 : tensor<1x3x3x4xf32>) { ^bb0(%arg3: f32, %arg4: f32): %8 = arith.addf %arg3, %arg3 : f32 linalg.yield %8 : f32 } -> tensor<1x3x3x4xf32> %6 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%0, %1 : tensor<1x4x4x3xf32>, tensor<2x2x3x4xf32>) outs(%5 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %7 = hal.tensor.export %6 : tensor<1x3x3x4xf32> -> !hal.buffer_view return %7 : !hal.buffer_view} 转换成, 1234567891011121314151617181920212223242526func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x4x4x3xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2x2x3x4xf32> %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<1x3x3x4xf32> %3 = linalg.init_tensor [1, 3, 3, 4] : tensor<1x3x3x4xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x3x3x4xf32>) outs(%4 : tensor<1x3x3x4xf32>) { ^bb0(%arg3: f32, %arg4: f32): %11 = arith.addf %arg3, %arg3 : f32 linalg.yield %11 : f32 } -> tensor<1x3x3x4xf32> %6 = linalg.init_tensor [1, 3, 3, 4] : tensor<1x3x3x4xf32> %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %8 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%0, %1 : tensor<1x4x4x3xf32>, tensor<2x2x3x4xf32>) outs(%7 : tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> 
(d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %5 : tensor<1x3x3x4xf32>, tensor<1x3x3x4xf32>) outs(%7 : tensor<1x3x3x4xf32>) { ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): %11 = arith.addf %arg3, %arg4 : f32 linalg.yield %11 : f32 } -> tensor<1x3x3x4xf32> %10 = hal.tensor.export %9 : tensor<1x3x3x4xf32> -> !hal.buffer_view return %10 : !hal.buffer_view} IREE::Flow::createVerifyInputLegalityPass 验证program是否合法。 IREE::Flow::createConvertLinalgMatmulToMmt4DPass 将2d的linalg.matmul tiling成linalg.mmt4d。默认不开启,可通过--iree-flow-mmt4d-target-options=\"enable_generic_slow arch=cuda\"选项开启。 12345678910func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<128x256xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<256x256xf32> %2 = linalg.init_tensor [128, 256] : tensor<128x256xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<128x256xf32>) -> tensor<128x256xf32> %4 = linalg.matmul ins(%0, %1 : tensor<128x256xf32>, tensor<256x256xf32>) outs(%3 : tensor<128x256xf32>) -> tensor<128x256xf32> %5 = hal.tensor.export %4 : tensor<128x256xf32> -> !hal.buffer_view return %5 : !hal.buffer_view} 转换成, 1234567891011121314151617181920212223242526272829303132333435func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<128x256xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<256x256xf32> %2 = linalg.init_tensor [128, 256] : tensor<128x256xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<128x256xf32>) -> tensor<128x256xf32> %4 = tensor.expand_shape %0 [[0, 1], [2, 3]] : tensor<128x256xf32> into tensor<16x8x128x2xf32> %5 = tensor.expand_shape %1 [[0, 1], [2, 3]] : tensor<256x256xf32> into tensor<128x2x64x4xf32> %6 = tensor.expand_shape %3 [[0, 1], [2, 3]] : tensor<128x256xf32> into tensor<16x8x64x4xf32> %7 = linalg.init_tensor [16, 128, 8, 2] : tensor<16x128x8x2xf32> %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4 : tensor<16x8x128x2xf32>) outs(%7 : tensor<16x128x8x2xf32>) { ^bb0(%arg2: f32, %arg3: f32): linalg.yield %arg2 : f32 } -> tensor<16x128x8x2xf32> %9 = linalg.init_tensor [64, 128, 4, 2] : tensor<64x128x4x2xf32> %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<128x2x64x4xf32>) outs(%9 : tensor<64x128x4x2xf32>) { ^bb0(%arg2: f32, %arg3: f32): linalg.yield %arg2 : f32 } -> tensor<64x128x4x2xf32> %11 = linalg.init_tensor [16, 64, 8, 4] : tensor<16x64x8x4xf32> %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<16x8x64x4xf32>) outs(%11 : tensor<16x64x8x4xf32>) { ^bb0(%arg2: f32, %arg3: f32): linalg.yield %arg2 : f32 } -> tensor<16x64x8x4xf32> // 16 x (128x8x2) @ 64 x (128x4x2) => 16 x 64 x sum_{128}(8x2 * (4x2)^T) %13 = linalg.mmt4d {comment = "generic tiling parameters, as no known kernel was matched for this matmul and 
target"} ins(%8, %10 : tensor<16x128x8x2xf32>, tensor<64x128x4x2xf32>) outs(%12 : tensor<16x64x8x4xf32>) -> tensor<16x64x8x4xf32> %14 = linalg.init_tensor [16, 8, 64, 4] : tensor<16x8x64x4xf32> %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<16x64x8x4xf32>) outs(%14 : tensor<16x8x64x4xf32>) { ^bb0(%arg2: f32, %arg3: f32): linalg.yield %arg2 : f32 } -> tensor<16x8x64x4xf32> %16 = tensor.collapse_shape %15 [[0, 1], [2, 3]] : tensor<16x8x64x4xf32> into tensor<128x256xf32> %17 = hal.tensor.export %16 : tensor<128x256xf32> -> !hal.buffer_view return %17 : !hal.buffer_view} IREE::Flow::createPadLinalgOpsToIntegerMultiplePass 将matmul的M、N和K扩充到paddingSize的整数倍,paddingSize默认为4。 mlir::createLinalgNamedOpConversionPass 将depth_multiplier=1的linalg.depthwise_conv_2d_nhwc_hwcm转换成linalg.depthwise_conv_2d_nhwc_hwc,将depth_multiplier=1的linalg.depthwise_conv_2d_nhwc_hwcm_q转换成linalg.depthwise_conv_2d_nhwc_hwc_q。 depth_multiplier的作用见 https://www.tensorflow.org/api_docs/python/tf/keras/layers/DepthwiseConv2D 。 1The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to filters_in * depth_multiplier. IREE::Flow::createExpandTensorShapesPass 将dynamic tensor扩充为tensor + dynamic dim的对偶形式,这么做的一个好处是动态维度可以直接参与计算和推导。比如 123456789101112131415161718192021// func.func private @add(%arg0 : tensor<?x2xf32>, %arg1 : tensor<?x2xf32>) -> tensor<?x2xf32>// iree_input.global private mutable @param : tensor<?x2xf32>// func.func @run(%arg0 : tensor<?x2xf32>) -> tensor<?x2xf32> {// %0 = iree_input.global.load @param : tensor<?x2xf32>// %1 = call @add(%0, %arg0) : (tensor<?x2xf32>, tensor<?x2xf32>) -> tensor<?x2xf32>// iree_input.global.store %1, @param : tensor<?x2xf32>// return %1 : tensor<?x2xf32>// }func.func private @add(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub}util.global private mutable @param : tensor<?x2xf32>func.func @run(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c0 = arith.constant 0 : index %param = util.global.load @param : tensor<?x2xf32> %dim = tensor.dim %param, %c0 : tensor<?x2xf32> %0 = hal.tensor.export %param : tensor<?x2xf32>{%dim} -> !hal.buffer_view %1 = call @add(%0, %arg0) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %2 = hal.buffer_view.dim<%1 : !hal.buffer_view>[0] : index %3 = hal.tensor.import %1 : !hal.buffer_view -> tensor<?x2xf32>{%2} util.global.store %3, @param : tensor<?x2xf32> return %1 : !hal.buffer_view} 被转换成, 1234567891011121314151617func.func private @add(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub}util.global private mutable @param : tensor<?x2xf32>util.global private mutable @param__d0 : indexfunc.func @run(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c0 = arith.constant 0 : index %param = util.global.load @param : tensor<?x2xf32> %param__d0 = util.global.load @param__d0 : index %0 = flow.tensor.tie_shape %param : tensor<?x2xf32>{%param__d0} %dim = tensor.dim %0, %c0 : tensor<?x2xf32> %1 = hal.tensor.export %0 : tensor<?x2xf32>{%dim} -> !hal.buffer_view %2 = call @add(%1, %arg0) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %3 = hal.buffer_view.dim<%2 : !hal.buffer_view>[0] : index %4 = hal.tensor.import %2 : !hal.buffer_view -> tensor<?x2xf32>{%3} util.global.store 
%4, @param : tensor<?x2xf32> util.global.store %3, @param__d0 : index return %2 : !hal.buffer_view} 从中可以看出几点变化: global tensor增加了一个表示动态维度的global index。 12345util.global private mutable @param : tensor<?x2xf32>转换成:util.global private mutable @param : tensor<?x2xf32>util.global private mutable @param__d0 : index global load 123456%param = util.global.load @param : tensor<?x2xf32>转换成:%param = util.global.load @param : tensor<?x2xf32>%param__d0 = util.global.load @param__d0 : index%0 = flow.tensor.tie_shape %param : tensor<?x2xf32>{%param__d0} global store 12345util.global.store %3, @param : tensor<?x2xf32>转换成:util.global.store %4, @param : tensor<?x2xf32>util.global.store %3, @param__d0 : index buildGlobalOptimizationPassPipeline IREE::Util::createSimplifyGlobalAccessesPass 这个pass主要做这几件事: 将不可变global tensor的load提前到了block的开头,将global tensor的store安全地挪到block的结尾。 进行以下化简: 如果load after store,则把load直接替换成store的source。比如, 123store %0, @p%1 = load @preturn %1 会被转换成, 12store %0, @preturn %0 如果store after store,则直接消除前一个store 12store %0, @pstore %1, @p 会被转换成, 1store %1, @p 如果load after load,则消除后一个load 123%0 = load @p%1 = load @preturn %1 会被转换成, 12%0 = load @preturn %0 一个完整的例子: 123456789101112131415161718func.func private @add(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub}util.global private mutable @param0 : tensor<1x2xf32>util.global private @param1 : tensor<1x2xf32>func.func @run(%arg0: !hal.buffer_view) attributes {iree.abi.stub} { %param0 = util.global.load @param0 : tensor<1x2xf32> %0 = hal.tensor.export %param0 : tensor<1x2xf32> -> !hal.buffer_view %1 = call @add(%0, %arg0) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %2 = hal.tensor.import %1 : !hal.buffer_view -> tensor<1x2xf32> util.global.store %2, @param0 : tensor<1x2xf32> %param0_0 = util.global.load @param0 : tensor<1x2xf32> %param1 = util.global.load @param1 : tensor<1x2xf32> %3 = hal.tensor.export %param0_0 : tensor<1x2xf32> -> !hal.buffer_view %4 = hal.tensor.export %param1 : tensor<1x2xf32> -> !hal.buffer_view %5 = call @add(%3, %4) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %6 = hal.tensor.import %5 : !hal.buffer_view -> tensor<1x2xf32> util.global.store %6, @param0 : tensor<1x2xf32> return} 转换成, 1234567891011121314151617func.func private @add(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} util.global private mutable @param0 : tensor<1x2xf32> util.global private @param1 : tensor<1x2xf32> func.func @run(%arg0: !hal.buffer_view) attributes {iree.abi.stub} { %param0 = util.global.load @param0 : tensor<1x2xf32> %param1 = util.global.load @param1 : tensor<1x2xf32> %0 = hal.tensor.export %param0 : tensor<1x2xf32> -> !hal.buffer_view %1 = call @add(%0, %arg0) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %2 = hal.tensor.import %1 : !hal.buffer_view -> tensor<1x2xf32> %3 = hal.tensor.export %2 : tensor<1x2xf32> -> !hal.buffer_view %4 = hal.tensor.export %param1 : tensor<1x2xf32> -> !hal.buffer_view util.global.store %2, @param0 : tensor<1x2xf32> %5 = call @add(%3, %4) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %6 = hal.tensor.import %5 : !hal.buffer_view -> tensor<1x2xf32> util.global.store %6, @param0 : tensor<1x2xf32> return } 这个例子中将param1的load操作提前,并且将%param0_0 = util.global.load @param0 : tensor<1x2xf32>直接替换为%2。 IREE::Util::createApplyPatternsPass 执行IREE::Util dialect ODS中定义的Canonicalization Patterns,并执行block和跳转命令参数化简操作。 block参数化简 123br ^bb1(%0, %0 : index, index)^bb1(%arg0: index, %arg1: index): ... 
折叠相同的参数,化简为 123br ^bb1(%0 : index)^bb1(%arg0: index): // %arg1 remapped to %arg0 ... 跳转命令参数消除 12345func.func @foo(%arg0: index) { br ^bb1(%arg0 : index) ^bb1(%0: index): ...} 消除参数后, 12345func.func @foo(%arg0: index) { br ^bb1 ^bb1: // %0 remapped to %arg0 ...} IREE::Util::createFoldGlobalsPass 这个pass继续对global tensor的load和store操作进行优化,主要包括: 内联常量store,比如 123456util.global mutable @a : i32func.func @fool { %c5 = arith.constant 5 : i32 util.global.store %c5, @a : i32 return} 转换成, 1util.global @a = 5 : i32 內联常量load,比如 12345util.global @a = 5 : i32func.func @fool { %1 = util.global.load @a : i32 ...} 转换成, 1234func.func @fool { %1 = arith.constant 5 : i32 ...} 重命名互为链式的global tensor。 如果一个mutable global tensor只在init函数中被store过,则将它修改为immutable。 删除没有load过的global tensor。 合并相同初始值的immutable global tensor。 IREE::Util::createHoistIntoGlobalsPass IREE::Flow::createTensorPadToTensorInsertSlicePass 将tensor.pad转换为linalg.fill + tensor.insert_slice。 12345678910func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x1xf32> %padded = tensor.pad %0 low[1, 2] high[3, 4] { ^bb0(%arg1: index, %arg2: index): tensor.yield %cst : f32 } : tensor<1x1xf32> to tensor<5x7xf32> %1 = hal.tensor.export %padded : tensor<5x7xf32> -> !hal.buffer_view return %1 : !hal.buffer_view} 转换为, 123456789func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x1xf32> %1 = tensor.empty() : tensor<5x7xf32> %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<5x7xf32>) -> tensor<5x7xf32> %inserted_slice = tensor.insert_slice %0 into %2[1, 2] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<5x7xf32> %3 = hal.tensor.export %inserted_slice : tensor<5x7xf32> -> !hal.buffer_view return %3 : !hal.buffer_view} mlir::createConvertElementwiseToLinalgPass 把elementwise算子(带有Elementwise traits的op)转换成linalg generic op,方便后续对elementwise op做算子融合。arith dialect和math dialect的op都是Elementwise的,所以实际上这个pass会把arith dialect和math dialect lower到linalg dialect。 123456func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x3xf32> %1 = arith.addf %0, %0 : tensor<2x3xf32> %2 = hal.tensor.export %1 : tensor<2x3xf32> -> !hal.buffer_view return %2 : !hal.buffer_view} 转换成, 12345678910func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x3xf32> %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0, %0 : tensor<2x3xf32>, tensor<2x3xf32>) outs(%0 : tensor<2x3xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %3 = arith.addf %in, %in_0 : f32 linalg.yield %3 : f32 } -> tensor<2x3xf32> %2 = hal.tensor.export %1 : tensor<2x3xf32> -> !hal.buffer_view return %2 : !hal.buffer_view} mlir::createLinalgFoldUnitExtentDimsPass 消除长度为1的维度或者循环。 12345678910func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x3xf32> %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<1x3xf32>) outs(%0 : tensor<1x3xf32>) { ^bb0(%in: f32, %out: 
f32): %3 = arith.addf %in, %in : f32 linalg.yield %3 : f32 } -> tensor<1x3xf32> %2 = hal.tensor.export %1 : tensor<1x3xf32> -> !hal.buffer_view return %2 : !hal.buffer_view} 转换成, 12345678910111213func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x3xf32> %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32> %collapsed_0 = tensor.collapse_shape %0 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32> %1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed : tensor<3xf32>) outs(%collapsed_0 : tensor<3xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %in : f32 linalg.yield %3 : f32 } -> tensor<3xf32> %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<3xf32> into tensor<1x3xf32> %2 = hal.tensor.export %expanded : tensor<1x3xf32> -> !hal.buffer_view return %2 : !hal.buffer_view} 可以看到其中的linalg.generic由2层循环缩减成了单层循环。 createInterchangeGenericOpsPass 循环维度变换。将reduction循环维度交换到最内层,相应的parallel循环维度被交换到外层。 1234567891011121314// sum(%arg0: tensor<2x3xf32>, 0) -> tensor<3xf32>func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x3xf32> %1 = tensor.empty() : tensor<3xf32> %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<3xf32>) -> tensor<3xf32> %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<2x3xf32>) outs(%2 : tensor<3xf32>) { ^bb0(%in: f32, %out: f32): %5 = arith.addf %in, %out : f32 linalg.yield %5 : f32 } -> tensor<3xf32> %4 = hal.tensor.export %3 : tensor<3xf32> -> !hal.buffer_view return %4 : !hal.buffer_view} 交换循环之后转换成, 12345678910111213func.func @foo(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x3xf32> %1 = tensor.empty() : tensor<3xf32> %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<3xf32>) -> tensor<3xf32> %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<2x3xf32>) outs(%2 : tensor<3xf32>) { ^bb0(%in: f32, %out: f32): %5 = arith.addf %in, %out : f32 linalg.yield %5 : f32 } -> tensor<3xf32> %4 = hal.tensor.export %3 : tensor<3xf32> -> !hal.buffer_view return %4 : !hal.buffer_view} memref::createResolveShapedTypeResultDimsPass mlir::createCanonicalizerPass mlir::createCSEPass createFusionOfTensorOpsPass 主要做elementwise的算子融合,其次也会将tensor.expand_shape转换成linalg generic op,方便进行算子融合。 elementwise算子融合的条件: producer和comsumer都是linalg generic op,且都为tensor语义。 producer只有一个use。 producer所有维度的迭代类型都是parallel,consumer的index map必须和producer具有相同的循环嵌套层数。 producer结果的index map必须是Permutation,即结果的每个元素有且仅store一次(输出是pointwise的)。 consumer可以包含reduction迭代类型,但需要保证融合后输入的index map可以覆盖每一个迭代维度,理由是如果缺失就无法确定该维度的循环边界。 12345678910111213141516171819202122232425262728// reduce(mul(arg0, arg1), 0)// for (int d0 = 0; d0 < n; ++d0) {// temp[d0] = arg0[d0] * arg1[d0];// }// result = 0;// for (int d0 = 0; d0 < n; ++d0) {// result += temp[d0];// }func.func @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> 
tensor<2xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2xf32> %2 = tensor.empty() : tensor<2xf32> %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<2xf32>, tensor<2xf32>) outs(%2 : tensor<2xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %8 = arith.mulf %in, %in_0 : f32 linalg.yield %8 : f32 } -> tensor<2xf32> %4 = tensor.empty() : tensor<f32> %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<f32>) -> tensor<f32> %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%3 : tensor<2xf32>) outs(%5 : tensor<f32>) { ^bb0(%in: f32, %out: f32): %8 = arith.addf %in, %out : f32 linalg.yield %8 : f32 } -> tensor<f32> %7 = hal.tensor.export %6 : tensor<f32> -> !hal.buffer_view return %7 : !hal.buffer_view} 融合mul和reduce之后转换成, 12345678910111213141516171819// result = 0;// for (int d0 = 0; d0 < n; ++d0) {// result += arg0[d0] * arg1[d0];// }func.func @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2xf32> %2 = tensor.empty() : tensor<f32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<f32>) -> tensor<f32> %4 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0, %1 : tensor<2xf32>, tensor<2xf32>) outs(%3 : tensor<f32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %6 = arith.mulf %in, %in_0 : f32 %7 = arith.addf %6, %out : f32 linalg.yield %7 : f32 } -> tensor<f32> %5 = hal.tensor.export %4 : tensor<f32> -> !hal.buffer_view return %5 : !hal.buffer_view} mlir::createLinalgDetensorizePass 将0-D Tensor转换为它的基础元素类型。 mlir::createCanonicalizerPass mlir::createCSEPass createSplitReductionPass 将matmul和topk的单次reduce分成两次reduce操作(一次batch matmul和一次add)。默认不开启,设置--iree-flow-split-matmul-reduction选项>=2可开启。 12345678910func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<128x256xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<256x256xf32> %2 = linalg.init_tensor [128, 256] : tensor<128x256xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<128x256xf32>) -> tensor<128x256xf32> %4 = linalg.matmul ins(%0, %1 : tensor<128x256xf32>, tensor<256x256xf32>) outs(%3 : tensor<128x256xf32>) -> tensor<128x256xf32> %5 = hal.tensor.export %4 : tensor<128x256xf32> -> !hal.buffer_view return %5 : !hal.buffer_view} --iree-flow-split-matmul-reduction=2转换成, 123456789101112131415161718192021222324func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<128x256xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<256x256xf32> %2 = linalg.init_tensor [128, 256] : tensor<128x256xf32> %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<128x256xf32>) -> tensor<128x256xf32> %4 = tensor.expand_shape %0 [[0], [1, 2]] : tensor<128x256xf32> into tensor<128x2x128xf32> %5 = tensor.expand_shape %1 [[0, 1], [2]] : tensor<256x256xf32> into tensor<2x128x256xf32> %6 = linalg.init_tensor [2, 128, 256] : 
tensor<2x128x256xf32> %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x128x256xf32>) -> tensor<2x128x256xf32> %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%4, %5 : tensor<128x2x128xf32>, tensor<2x128x256xf32>) outs(%7 : tensor<2x128x256xf32>) attrs = {__internal_linalg_transform__ = "SPLIT", linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]} { ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): %11 = arith.mulf %arg2, %arg3 : f32 %12 = arith.addf %arg4, %11 : f32 linalg.yield %12 : f32 } -> tensor<2x128x256xf32> %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>], iterator_types = ["reduction", "parallel", "parallel"]} ins(%8 : tensor<2x128x256xf32>) outs(%3 : tensor<128x256xf32>) attrs = {__internal_linalg_transform__ = "SPLIT"} { ^bb0(%arg2: f32, %arg3: f32): %11 = arith.addf %arg2, %arg3 : f32 linalg.yield %11 : f32 } -> tensor<128x256xf32> %10 = hal.tensor.export %9 : tensor<128x256xf32> -> !hal.buffer_view return %10 : !hal.buffer_view} createInterchangeGenericOpsPass 循环维度变换。将reduction循环维度交换到最内层,相应的parallel循环维度被交换到外层。 createInterchangeTransposeGenericOpsPass 当输入indexing map是permutation时,交换循环维度使得输入的indexing map是identity的,其作用是使得输入尽可能变成连续访存。 createDispatchWithTransformDialect 根据transform dialect对算子进行调度和派遣,需要另外加载一个transform dialect的module文件,默认不做该变换。transform dialect定义了一套调度规则,用于引导目标IR进行变换,比如循环展开、tiling等。 createFormDispatchRegionsPass 以包含reduction loop的linalg op或named linalg op为中心(root),按一定规则合并producers和comsumers,划分出dispatch region子图。dispatch region是IREE中的原子执行单元,dispatch region内部可以直接复用输入和输出的内存,从而避免了内部的内存分配操作,内存分配只发生在dispatch region的边界,同时dispatch region之间会自动插入同步操作。 12345678910111213141516func.func @predict(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x10xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x5xf32> %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<5xf32> %3 = tensor.empty() : tensor<2x5xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x5xf32>) -> tensor<2x5xf32> %5 = linalg.matmul ins(%0, %1 : tensor<2x10xf32>, tensor<10x5xf32>) outs(%4 : tensor<2x5xf32>) -> tensor<2x5xf32> %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<2x5xf32>, tensor<5xf32>) outs(%3 : tensor<2x5xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %8 = arith.addf %in, %in_0 : f32 linalg.yield %8 : f32 } -> tensor<2x5xf32> %7 = hal.tensor.export %6 : tensor<2x5xf32> -> !hal.buffer_view return %7 : !hal.buffer_view} 转换成, 12345678910111213141516171819202122232425262728293031func.func @predict(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x10xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x5xf32> %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<5xf32> %3 = tensor.empty() : tensor<2x5xf32> %4 = linalg.fill 
ins(%cst : f32) outs(%3 : tensor<2x5xf32>) -> tensor<2x5xf32> %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c2 = arith.constant 2 : index %c1_0 = arith.constant 1 : index %5 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %c2, %c1_0] %c0_1 = arith.constant 0 : index %c5 = arith.constant 5 : index %c1_2 = arith.constant 1 : index %6 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_1, %c5, %c1_2] %7 = flow.dispatch.region[%5, %6] -> (tensor<2x5xf32>) { %9 = linalg.matmul ins(%0, %1 : tensor<2x10xf32>, tensor<10x5xf32>) outs(%4 : tensor<2x5xf32>) -> tensor<2x5xf32> %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %2 : tensor<2x5xf32>, tensor<5xf32>) outs(%3 : tensor<2x5xf32>) { ^bb0(%in: f32, %in_3: f32, %out: f32): %11 = arith.addf %in, %in_3 : f32 linalg.yield %11 : f32 } -> tensor<2x5xf32> flow.return %10 : tensor<2x5xf32> } count(%arg3: index, %arg4: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg3, %arg4 flow.return %x, %y, %z : index, index, index } %8 = hal.tensor.export %7 : tensor<2x5xf32> -> !hal.buffer_view return %8 : !hal.buffer_view} createFormDispatchWorkgroupsPass 将dispatch region转换成dispatch work group的形式,并将cloneable的op(比如tensor.fill、tensor.empty等)拷贝到work group中。如果在linalg层做了tiling,该pass也会把tiling引入的tensor.extract_slice和tensor.insert_slice尽可能转换成flow.tensor.slice和flow.tensor.update,转换不了的后续再转换成flow.dispatch.tensor.load和flow.dispatch.tensor.store。这里上一步的结果会被转换成, 1234567891011121314151617181920212223242526272829func.func @predict(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c2 = arith.constant 2 : index %c5 = arith.constant 5 : index %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2x10xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x5xf32> %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<5xf32> %3 = flow.dispatch.workgroups[%c2, %c5](%0, %1, %2) : (tensor<2x10xf32>, tensor<10x5xf32>, tensor<5xf32>) -> tensor<2x5xf32> = (%arg3: !flow.dispatch.tensor<readonly:tensor<2x10xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<10x5xf32>>, %arg5: !flow.dispatch.tensor<readonly:tensor<5xf32>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x5xf32>>) { %cst = arith.constant 0.000000e+00 : f32 %5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [2, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10xf32>> -> tensor<2x10xf32> %6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [10, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x5xf32>> -> tensor<10x5xf32> %7 = flow.dispatch.tensor.load %arg5, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5xf32>> -> tensor<5xf32> %8 = tensor.empty() : tensor<2x5xf32> %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x5xf32>) -> tensor<2x5xf32> %10 = linalg.matmul ins(%5, %6 : tensor<2x10xf32>, tensor<10x5xf32>) outs(%9 : tensor<2x5xf32>) -> tensor<2x5xf32> %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<2x5xf32>, tensor<5xf32>) outs(%8 : tensor<2x5xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %12 = arith.addf %in, %in_0 : f32 linalg.yield %12 : f32 } 
-> tensor<2x5xf32> flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [2, 5], strides = [1, 1] : tensor<2x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x5xf32>> flow.return } count(%arg3: index, %arg4: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg3, %arg4 flow.return %x, %y, %z : index, index, index } %4 = hal.tensor.export %3 : tensor<2x5xf32> -> !hal.buffer_view return %4 : !hal.buffer_view} createCaptureDispatchDynamicDimsPass 由于flow.dispatch.workgroups的参数中动态形状tensor被替换成了!flow.dispatch.tensor和相应的动态维度index,该pass捕获workgroups参数中的动态维度index,插入flow.dispatch.tie_shape将参数中的动态维度index和!flow.dispatch.tensor进行绑定。 12345678910111213141516171819202122232425262728293031// func.func @test(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {// %0 = mhlo.add %arg0, %arg1 : tensor<?xf32>// return %0 : tensor<?xf32>// }func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?xf32>{%0} %2 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index %3 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<?xf32>{%2} %4 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %0, %c1] %5 = flow.dispatch.workgroups[%4](%0, %1, %3, %0, %2, %0) : (index, tensor<?xf32>{%0}, tensor<?xf32>{%2}, index, index, index) -> tensor<?xf32>{%0} = (%arg2: index, %arg3: !flow.dispatch.tensor<readonly:tensor<?xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?xf32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) { %7 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [%arg7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg7} -> tensor<?xf32> %8 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [%arg6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg6} -> tensor<?xf32> %9 = tensor.empty(%arg7) : tensor<?xf32> %10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7, %8 : tensor<?xf32>, tensor<?xf32>) outs(%9 : tensor<?xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %11 = arith.addf %in, %in_0 : f32 linalg.yield %11 : f32 } -> tensor<?xf32> flow.dispatch.tensor.store %10, %arg8, offsets = [0], sizes = [%arg7], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg7} flow.return } count(%arg2: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2 flow.return %x, %y, %z : index, index, index } %6 = hal.tensor.export %5 : tensor<?xf32>{%0} -> !hal.buffer_view return %6 : !hal.buffer_view} 会被转换成, 123456789101112131415161718192021222324252627282930func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index %1 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?xf32>{%0} %2 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index %3 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<?xf32>{%2} %4 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %0, %c1] %5 = flow.dispatch.workgroups[%4](%0, %1, %3, %0, %2, %0) : (index, 
tensor<?xf32>{%0}, tensor<?xf32>{%2}, index, index, index) -> tensor<?xf32>{%0} = (%arg2: index, %arg3: !flow.dispatch.tensor<readonly:tensor<?xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?xf32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) { %7 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg7} %8 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg6} %9 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg7} %10 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%arg7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg7} -> tensor<?xf32> %11 = flow.dispatch.tensor.load %8, offsets = [0], sizes = [%arg6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg6} -> tensor<?xf32> %12 = tensor.empty(%arg7) : tensor<?xf32> %13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10, %11 : tensor<?xf32>, tensor<?xf32>) outs(%12 : tensor<?xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %14 = arith.addf %in, %in_0 : f32 linalg.yield %14 : f32 } -> tensor<?xf32> flow.dispatch.tensor.store %13, %9, offsets = [0], sizes = [%arg7], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg7} flow.return } count(%arg2: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2 flow.return %x, %y, %z : index, index, index } %6 = hal.tensor.export %5 : tensor<?xf32>{%0} -> !hal.buffer_view return %6 : !hal.buffer_view} mlir::createCanonicalizerPass createCSEPass createInitializeEmptyTensorsPass 如果tensor.empty op的user中存在非linalg或IREE LinalgExt op,则把该tensor.empty op转换成flow.tensor.empty或flow.tensor.splat op。 IREE::Flow::createOutlineDispatchRegionsPass 把每个dispatch region转换成flow.executable + flow.dispatch op。 1234567891011121314151617181920212223func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c2 = arith.constant 2 : index %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2xf32> %2 = flow.dispatch.workgroups[%c2](%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> = (%arg2: !flow.dispatch.tensor<readonly:tensor<2xf32>>, %arg3: !flow.dispatch.tensor<readonly:tensor<2xf32>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<2xf32>>) { %4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32> %5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32> %6 = tensor.empty() : tensor<2xf32> %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<2xf32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %8 = arith.addf %in, %in_0 : f32 linalg.yield %8 : f32 } -> tensor<2xf32> flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !flow.dispatch.tensor<writeonly:tensor<2xf32>> flow.return } count(%arg2: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2 flow.return %x, %y, %z : index, index, index } %3 = hal.tensor.export %2 : 
tensor<2xf32> -> !hal.buffer_view return %3 : !hal.buffer_view} 转换成 12345678910111213141516171819202122232425262728flow.executable private @test_dispatch_0 { flow.executable.export public @test_dispatch_0_generic_2 workgroups(%arg0: index) -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0 flow.return %x, %y, %z : index, index, index}builtin.module { func.func @test_dispatch_0_generic_2(%arg0: !flow.dispatch.tensor<readonly:tensor<2xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2xf32>>) { %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32> %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xf32>> -> tensor<2xf32> %2 = tensor.empty() : tensor<2xf32> %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<2xf32>, tensor<2xf32>) outs(%2 : tensor<2xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %4 = arith.addf %in, %in_0 : f32 linalg.yield %4 : f32 } -> tensor<2xf32> flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !flow.dispatch.tensor<writeonly:tensor<2xf32>> return } }}func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %c2 = arith.constant 2 : index %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<2xf32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<2xf32> %2 = flow.dispatch @test_dispatch_0::@test_dispatch_0_generic_2[%c2](%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> %3 = hal.tensor.export %2 : tensor<2xf32> -> !hal.buffer_view return %3 : !hal.buffer_view} IREE::Util::createStripDebugOpsPass 消除DebugOnly op。 mlir::createCanonicalizerPass IREE::Flow::createDeduplicateExecutablesPass 消除重复的flow.executable。 IREE::Flow::createInjectDispatchTracingPass 注入跟踪运行时dispatch函数输入和输出信息的op。默认不开启。 IREE::Flow::createCleanupTensorShapesPass 删除flow.tensor.tie_shape op,并确认module中不再包含tensor.dim和tensor.rank 这两类形状查询op。 mlir::createCanonicalizerPass mlir::createCSEPass mlir::createCanonicalizerPass mlir::createCSEPass mlir::createSymbolDCEPass","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"IREE编译流程解析(三)","slug":"IREE编译流程3","date":"2023-01-04T12:30:12.000Z","updated":"2023-02-17T11:31:47.491Z","comments":true,"path":"2023/01/04/IREE编译流程3/","link":"","permalink":"https://hjchen2.github.io/2023/01/04/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B3/","excerpt":"IREE ABI::TransformPassPipeline主要作用是将外部导入的接口和本module导出到外部的接口参数统一成标准标量类型或hal.buffer_view类型(hal.buffer_view对应tensor),包含以下几个passes。","text":"IREE ABI::TransformPassPipeline主要作用是将外部导入的接口和本module导出到外部的接口参数统一成标准标量类型或hal.buffer_view类型(hal.buffer_view对应tensor),包含以下几个passes。 createWrapEntryPointsPass 给external func生成一个内部函数,函数中调用原始的external func,同时将public func的函数体包装成一个新的函数,原public func中调用该函数。该pass最终的目的是将外部导入的接口和本module导出到外部的接口参数统一成标准标量类型或hal.buffer_view(hal.buffer_view对应tensor类型)。 12345678// external/imported funcfunc.func private @add(tensor<f32>, 
tensor<f32>) -> tensor<f32>// public/exported funcfunc.func @test(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> { %0 = call @add(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32> return %0 : tensor<f32>} 转换成, 12345678910111213141516171819func.func private @add(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub}func.func private @_add(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> { %0 = hal.tensor.export %arg0 : tensor<f32> -> !hal.buffer_view %1 = hal.tensor.export %arg1 : tensor<f32> -> !hal.buffer_view %2 = call @add(%0, %1) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %3 = hal.tensor.import %2 : !hal.buffer_view -> tensor<f32> return %3 : tensor<f32>}func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<f32> %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<f32> %2 = call @_test(%0, %1) : (tensor<f32>, tensor<f32>) -> tensor<f32> %3 = hal.tensor.export %2 : tensor<f32> -> !hal.buffer_view return %3 : !hal.buffer_view}func.func private @_test(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> { %0 = call @_add(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32> return %0 : tensor<f32>} mlir::createInlinerPass 将WrapEntryPointsPass中生成的wrap函数内联起来。最终转换成, 12345func.func private @add(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub}func.func @test(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { %0 = call @add(%arg0, %arg1) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view return %0 : !hal.buffer_view} mlir::createCanonicalizerPass mlir::createCSEPass mlir::createSymbolDCEPass","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"IREE编译流程解析(二)","slug":"IREE编译流程2","date":"2023-01-04T12:20:12.000Z","updated":"2023-02-17T11:31:40.295Z","comments":true,"path":"2023/01/04/IREE编译流程2/","link":"","permalink":"https://hjchen2.github.io/2023/01/04/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B2/","excerpt":"IREE CommonInputConversionPassPipeline主要作用是将IREE::Input dialect lower成IREE::Util、IREE::Flow和IREE::HAL dialect,包括以下几个passes。","text":"IREE CommonInputConversionPassPipeline主要作用是将IREE::Input dialect lower成IREE::Util、IREE::Flow和IREE::HAL dialect,包括以下几个passes。 createIREEImportPublicPass 将IREE::Input dialect转换成IREE::Util、IREE::Flow和IREE::HAL dialect,并转换func的属性和signature中输入输出类型。比如, 1234567iree_input.global private mutable @param : tensor<1x2xf32>func.func @run(%arg0: tensor<1x2xf32>) { %0 = iree_input.global.load @param : tensor<1x2xf32> %1 = iree_input.tensor.clone %0 : tensor<1x2xf32> iree_input.global.store %1, @param : tensor<1x2xf32> return} 转换成(iree_input.global.load->util.global.load,iree_input.global.store->util.global.store,iree_input.tensor.clone->flow.tensor.clone): 1234567util.global private mutable @param : tensor<1x2xf32>func.func @run(%arg0: tensor<1x2xf32>) { %param = util.global.load @param : tensor<1x2xf32> %0 = flow.tensor.clone %param : tensor<1x2xf32> util.global.store %0, @param : tensor<1x2xf32> return} createImportMLProgramPass 将ml_program dialect转换到IREE::Util dialect。 
createSanitizeModuleNamesPass 将module name中的.替换为_,以符合mlir identifiers的命名规范。 123456module @iree.module { func.func @test(%arg0: f32, %arg1: f32) -> f32 { %0 = arith.addf %arg0, %arg1 : f32 return %0 : f32 }} 转换成 123456module @iree_module { func.func @test(%arg0: f32, %arg1: f32) -> f32 { %0 = arith.addf %arg0, %arg1 : f32 return %0 : f32 }}","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"IREE编译流程解析(一)","slug":"IREE编译流程1","date":"2023-01-04T12:14:12.000Z","updated":"2023-02-21T12:50:30.443Z","comments":true,"path":"2023/01/04/IREE编译流程1/","link":"","permalink":"https://hjchen2.github.io/2023/01/04/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B1/","excerpt":"IREE InputConversionPassPipeline的主要作用是将不同的输入(MHLO、XLA、Torch Tensor和TOSA)统一lower成linalg dialect和builtin的arith dialect、scf dialect和tensor dialect。下面以MHLO输入为例,列举了InputConversionPassPipeline中各个pass以及它们的主要作用。","text":"IREE InputConversionPassPipeline的主要作用是将不同的输入(MHLO、XLA、Torch Tensor和TOSA)统一lower成linalg dialect和builtin的arith dialect、scf dialect和tensor dialect。下面以MHLO输入为例,列举了InputConversionPassPipeline中各个pass以及它们的主要作用。 mhlo::createLegalizeControlFlowPass 将TF1.0中的控制流原语(http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf )规范化成HLO中的控制流算子。 createTopLevelSCFToCFGPass 将顶层的structured control flow表示的控制流图转换成更底层基础块的控制流图(CFG)。 createMHLOToMHLOPreprocessingPass mlir::createCanonicalizerPass mlir::createShapeToShapeLowering 将 shape.num_elements 转换成 shape.reduce。 mlir::createConvertShapeToStandardPass 将shape dialect lower成arith dialect、scf dialect和tensor dialect。比如 12345678func.func @test(%arg0: tensor<1x?xf32>, %arg1: tensor<?xf32>) -> index { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %0 = shape.dim %arg0, %c1 : tensor<1x?xf32>, index -> index %1 = shape.dim %arg1, %c0 : tensor<?xf32>, index -> index %2 = shape.add %0, %1 : index, index -> index return %2 : index} 转换成 1234567891011121314151617func.func @test(%arg0: tensor<1x?xf32>, %arg1: tensor<?xf32>) -> index { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c1_0 = arith.constant 1 : index %c1_1 = arith.constant 1 : index %0 = tensor.dim %arg0, %c1_1 : tensor<1x?xf32> %1 = tensor.from_elements %c1_0, %0 : tensor<2xindex> %2 = tensor.cast %1 : tensor<2xindex> to tensor<2xindex> %3 = tensor.dim %arg0, %c1 : tensor<1x?xf32> %c0_2 = arith.constant 0 : index %4 = tensor.dim %arg1, %c0_2 : tensor<?xf32> %5 = tensor.from_elements %4 : tensor<1xindex> %6 = tensor.cast %5 : tensor<1xindex> to tensor<1xindex> %7 = tensor.dim %arg1, %c0 : tensor<?xf32> %8 = arith.addi %3, %7 : index return %8 : index } mlir::createCanonicalizerPass mlir::createInlinerPass 内联calls和callable operations,并删除dead callables。比如: 12345678func.func @test(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = call @add(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> return %0 : tensor<1xf32>}func.func private @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = mhlo.add %arg0, %arg1 : tensor<1xf32> return %0 : tensor<1xf32>} 私有的add函数被内联之后删除, 1234func.func @test(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = mhlo.add %arg0, %arg1 : tensor<1xf32> return %0 : tensor<1xf32>} 
IREE::Util::createDemoteI64ToI32Pass IREE::Util::createDemoteF64ToF32Pass mlir::createCanonicalizerPass mlir::createCSEPass mhlo::createLegalizeShapeComputationsPass 把scalar tensor op转换成scalar op + fromElements op。比如 123456func.func @test(%arg0: f32, %arg1: f32) -> tensor<1xf32> { %0 = tensor.from_elements %arg0 : tensor<1xf32> %1 = tensor.from_elements %arg1 : tensor<1xf32> %2 = mhlo.add %0, %1 : tensor<1xf32> return %2 : tensor<1xf32>} 转换成: 12345func.func @test(%arg0: f32, %arg1: f32) -> tensor<1xf32> { %0 = arith.addf %arg0, %arg1 : f32 %1 = tensor.from_elements %0 : tensor<1xf32> return %1 : tensor<1xf32>} createConvertMHLOToLinalgExtPass 将mhlo::sort、mhlo.scatter、mhlo.fft、mhlo.reverse、mhlo.topk转换到IREE::LinalgExt dialect,同时将在IREE::LinalgExt dialect区域内部的mhlo op转换成linalg dialect,mhlo.return则转换成iree_linalg_ext.yield。比如, 12345678func.func @test(%arg0: tensor<10xf32>) -> tensor<10xf32> { %0 = "mhlo.sort"(%arg0) ({ ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>): %1 = mhlo.compare GT, %arg1, %arg2 : (tensor<f32>, tensor<f32>) -> tensor<i1> mhlo.return %1 : tensor<i1> }) {dimension = 0 : i64} : (tensor<10xf32>) -> tensor<10xf32> return %0 : tensor<10xf32>} 转换成, 12345678func.func @test(%arg0: tensor<10xf32>) -> tensor<10xf32> { %0 = iree_linalg_ext.sort dimension(0) outs(%arg0 : tensor<10xf32>) { ^bb0(%arg1: f32, %arg2: f32): %1 = arith.cmpf ogt, %arg1, %arg2 : f32 iree_linalg_ext.yield %1 : i1 } -> tensor<10xf32> return %0 : tensor<10xf32>} createMHLOToLinalgOnTensorsPass 将外层剩余的mhlo op转换到linalg dialect。比如 1234func.func @test(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = mhlo.add %arg0, %arg1 : tensor<1xf32> return %0 : tensor<1xf32>} 转换成, 123456789func.func @test(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = linalg.init_tensor [1] : tensor<1xf32> %1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<1xf32>, tensor<1xf32>) outs(%0 : tensor<1xf32>) { ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): %2 = arith.addf %arg2, %arg3 : f32 linalg.yield %2 : f32 } -> tensor<1xf32> return %1 : tensor<1xf32>} mlir::createReconcileUnrealizedCastsPass 消除unrealized conversion cast操作。算法过程描述:如果unrealized conversion cast是dead节点(没有user或所有users也都是unrealized conversion cast),则直接删除该dead节点;如果是live节点(至少有一个非unrealized conversion cast的user),则遍历其所有子节点,如果其子节点中所有unrealized conversion cast的result type与该op的input type相同(即不存在真实意义的type cast操作),则将所有遍历到的unrealized conversion cast都折叠成该op的输入,否则报错live unrealized conversion cast。 mlir::createCanonicalizerPass createVerifyCompilerMHLOInputLegality 验证program是否合法。","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"IREE编译流程解析","slug":"IREE编译流程","date":"2023-01-04T04:00:04.000Z","updated":"2023-02-24T12:36:50.764Z","comments":true,"path":"2023/01/04/IREE编译流程/","link":"","permalink":"https://hjchen2.github.io/2023/01/04/IREE%E7%BC%96%E8%AF%91%E6%B5%81%E7%A8%8B/","excerpt":"IREE目前支持将MHLO或XLA、Torch Tensor和TOSA作为输入,经过一系列passes编译生成IREE定义的VM bytecode中间产物,其中硬件相关代码会编译成相应的Executable,保存在VM bytecode中供host进行调用,比如CUDA相关的计算代码会被lower成PTX代码,在IREE的runtime中再被CUDA的运行时以JIT的方式编译成可执行的cubin kernel。","text":"IREE目前支持将MHLO或XLA、Torch 
Tensor和TOSA作为输入,经过一系列passes编译生成IREE定义的VM bytecode中间产物,其中硬件相关代码会编译成相应的Executable,保存在VM bytecode中供host进行调用,比如CUDA相关的计算代码会被lower成PTX代码,在IREE的runtime中再被CUDA的运行时以JIT的方式编译成可执行的cubin kernel。 IREE编译的入口是IREEVMTransformPassPipeline,IREEVMTransformPassPipeline又被分成InputConversionPassPipeline、CommonInputConversionPassPipeline、ABI::TransformPassPipeline、Flow::FlowTransformPassPipeline、Stream::StreamTransformPassPipeline(仅CUDA后端)、HAL::HALTransformPassPipeline、VM::VMTransformPassPipeline等几个阶段。 InputConversionPassPipeline IREE编译流程解析(一) 主要作用是将不同的输入(MHLO或XLA、Torch Tensor和TOSA)统一lower成linalg dialect和builtin的arith dialect、scf dialect和tensor dialect。 CommonInputConversionPassPipeline IREE编译流程解析(二) 主要作用是将IREE::Input dialect lower成IREE::Util、IREE::Flow和IREE::HAL dialect。 ABI::TransformPassPipeline IREE编译流程解析(三) 主要作用是将外部导入的接口和本module导出到外部的接口参数统一成标准标量类型或hal.buffer_view类型(hal.buffer_view对应tensor)。 Flow::FlowTransformPassPipeline IREE编译流程解析(四) 主要作用是执行一系列窥孔优化,比如1x1的conv2d转换成matmul、tiling、op fusion等,最终将workload拆分成flow.executable。 Stream::StreamTransformPassPipeline IREE编译流程解析(五) 主要作用是将program转换到stream dialect,优化变量编码方式,划分调度子图,生成异步调度策略,并实现内存规划策略。 HAL::HALTransformPassPipeline IREE编译流程解析(六) 主要作用是进行tiling、vectorization和bufferization等操作,分配计算负载,最终生成target device的代码。比如cuda target的dispatch source code会被递降为NVVM IR。 VM::VMTransformPassPipeline","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"}]},{"title":"如何在XRT框架下添加自定义的后端引擎","slug":"如何在XRT框架下添加自定义的后端引擎","date":"2020-02-25T08:06:18.000Z","updated":"2023-02-07T02:39:00.722Z","comments":true,"path":"2020/02/25/如何在XRT框架下添加自定义的后端引擎/","link":"","permalink":"https://hjchen2.github.io/2020/02/25/%E5%A6%82%E4%BD%95%E5%9C%A8XRT%E6%A1%86%E6%9E%B6%E4%B8%8B%E6%B7%BB%E5%8A%A0%E8%87%AA%E5%AE%9A%E4%B9%89%E7%9A%84%E5%90%8E%E7%AB%AF%E5%BC%95%E6%93%8E/","excerpt":"XRT为不同的后端引擎提供了统一的上层功能和接口抽象,这些功能和接口包括: 统一的DAG计算图表示 统一的子图表达、切分和折叠过程 统一的JIT子图编译接口和缓存机制 统一的Executable Launch接口 得益于上层统一的抽象和模块化的设计,后端引擎只需要处理一些差异化的接口,并且这些差异化通常只体现在子图的编译和executable launch接口的具体实现上。","text":"XRT为不同的后端引擎提供了统一的上层功能和接口抽象,这些功能和接口包括: 统一的DAG计算图表示 统一的子图表达、切分和折叠过程 统一的JIT子图编译接口和缓存机制 统一的Executable Launch接口 得益于上层统一的抽象和模块化的设计,后端引擎只需要处理一些差异化的接口,并且这些差异化通常只体现在子图的编译和executable launch接口的具体实现上。 我们把XRT的每个子图都看成是一个function,function包含输入和输出参数,以及对应的函数体(DAG表示的计算图),比如下面表示的是只包含一个relu节点的XRT子图,其中node表示计算节点,input和output分别表示子图的输入和输出。 1234567891011121314151617181920212223242526272829function { input { name: "_xrt_entry_0" value: "_MyGraph_0_input.0.0_2/out" } output { name: "_xrt_return_0" value: "relu-0/y_0" } node { name: "relu-0" device_tag: "cuda" user_conf { op_type_name: "relu" input { key: "x" value { s: "_MyGraph_0_input.0.0_2/out" } } output { key: "y" value { s: "relu-0/y_0" } } } }} 在runtime阶段function首先需要被编译成executable,执行function实际上就是feed对应的输入参数去launch这个编译好的executable,同时得到执行的结果,即function的返回值。 在XRT框架下每个后端引擎都有一个与之相对应的executable(比如XLA的XlaExecutable和TensorRT的TrtExecutable),和将function编译成对应executable的compiler(比如XLA的XlaGraphCompiler和TensorRT的TrtGraphCompiler),因此添加一个新的后端引擎,通常只需要添加一个对应的executable和compiler。下面以添加一个自定义的后端引擎Toy为例,详细介绍在XRT框架下支持新的后端引擎的具体过程。 首先在xrt.proto文件中XrtEngine下增加一个Toy引擎字段。 1234567enum XrtEngine { DEFAULT = 1; XLA = 2; TENSORRT = 3; TVM = 4; TOY = 5; // For Toy engine} 
如果Toy引擎针对的硬件不在XrtDevice中,则需要在XrtDevice中增加对应的设备字段。这里我们假设自定义的Toy引擎只支持GPU_CUDA,因此就不需要修改XrtDevice了。 接下来,与XLA和TensorRT一样,我们在oneflow_xrt/compiler目录下创建一个toy目录,其余所有与Toy引擎相关的代码都将放在该目录下。 Toy Executable 在增加任何一个后端引擎之前,我们都需要仔细考虑该后端引擎所需的最小执行环境,一个最简单的执行环境包括输入输出、中间结果以及执行具体计算逻辑的硬件代码,这个代码可以是通过codegen自动生成的,也可以是手工实现的。 接下来我们给自定义的Toy引擎增加一个对应的ToyExecutable。在oneflow_xrt/compiler/toy目录下,我们创建文件toy_executable.h和toy_executable.cpp。 toy_executable.h中定义ToyExecutable,ToyExecutable必须继承自Executable,并实现Run接口。为了尽可能简单,ToyExecutable只包含输出outputs、中间结果tmp_buffers和编排好的函数调用列表func_codes,以及每个函数的输入输出参数对应的buffer序号func_args_。 12345678910111213141516171819202122232425262728293031323334353637383940414243#ifndef ONEFLOW_XRT_COMPILER_TOY_TOY_EXECUTABLE_H_#include "oneflow_xrt/compiler/executable.h"#include "oneflow_xrt/compiler/parameter.h"#include <vector>#include <functional>namespace oneflow {namespace xrt {typedef std::function<void(const std::vector<Parameter> &, const std::vector<Parameter> &)> FuncCode;struct FuncArgumentIndices { std::vector<int> inputs; std::vector<int> outputs;};class ToyExecutable : public Executable { public: ToyExecutable(const std::string &name, const int num_inputs, const std::vector<Parameter> &outputs, const std::vector<Parameter> &temp_buffers, const std::vector<FuncCode> &func_codes, const std::vector<FuncArgumentIndices> &func_args); bool Run(const std::vector<Parameter> &inputs, const ExecutableRunOptions &run_options, bool block_until_done = true) override; private: int num_inputs_; std::vector<Parameter> outputs_; std::vector<Parameter> temp_buffers_; std::vector<FuncCode> func_codes_; std::vector<FuncArgumentIndices> func_args_;};} // namespace xrt} // namespace oneflow#endif // ONEFLOW_XRT_COMPILER_TOY_TOY_EXECUTABLE_H_ 在toy_executable.cpp中实现Run方法,这里我们只是简单的顺序执行编排好的函数func_codes。 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657#include "oneflow_xrt/compiler/toy/toy_executable.h"namespace oneflow {namespace xrt {ToyExecutable::ToyExecutable(const std::string &name, const int num_inputs, const std::vector<Parameter> &outputs, const std::vector<Parameter> &temp_buffers, const std::vector<FuncCode> &func_codes, const std::vector<FuncArgumentIndices> &func_args) : Executable(name, XrtEngine::TOY), num_inputs_(num_inputs), outputs_(outputs), temp_buffers_(temp_buffers), func_codes_(func_codes), func_args_(func_args) {}bool ToyExecutable::Run(const std::vector<Parameter> &inputs, const ExecutableRunOptions &run_options, bool block_until_done) { auto PullArgs = [&](const std::vector<int> &indices) { std::vector<Parameter> args; for (int idx : indices) { if (idx < num_inputs_) { args.push_back(inputs[idx]); } else if (idx < num_inputs_ + outputs_.size()) { args.push_back(outputs_[idx - num_inputs_]); } else { idx -= (num_inputs_ + outputs_.size()); CHECK_GE(idx, 0); CHECK_LT(idx, temp_buffers_.size()); args.push_back(temp_buffers_[idx]); } } return std::move(args); }; CHECK_EQ(inputs.size(), num_inputs_); for (int i = 0; i < func_codes_.size(); ++i) { auto in_args = PullArgs(func_args_[i].inputs); auto out_args = PullArgs(func_args_[i].outputs); func_codes_[i](in_args, out_args); } // Synchronize stream if block_until_done if (block_until_done) { // TODO() } // All return params are the results of the executable this->results_ = run_options.return_params; return true /*running status*/;}} // namespace xrt} // namespace oneflow 目前为止我们已经完成了一个最简单的运行时executable,这个executable甚至有点类似其他框架中提供的最简单的图执行器(graph executor)。接下来我们要介绍如何将一个XRT的子图编译成上面的ToyExecutable。 Toy 
Compiler 每个后端引擎都对应一个compiler,当我们希望使用某个后端引擎来执行一个XRT子图时,就需要有一个对应的compiler将该子图编译成后端引擎对应的executable。Compiler通常都非常注重编译产物的执行性能,而性能以外的关切点也导致了不同的技术路线,比如对算法通用性、跨平台有高度关切的TVM和XLA采用了LLVM传统编译器的路线,而对于过分看重性能但硬件平台单一的TensorRT更多的则是采用手工优化和tuning相结合的策略。不过这两种技术路线并不是完全对立的,也是在不断地相互借鉴和融合。 在XRT中,所有这些技术方案都是可以被兼容的,你可以根据实际情况自由切换,你也可以把XRT当成实验场所,实现一个自定义的compiler,并在同一套框架下对比不同compiler、不同技术方案的优劣。 回到本文的主题,我们现在需要实现一个ToyExecutable对应的compiler,我们也把该compiler叫做ToyGraphCompiler。 首先在oneflow_xrt/compiler/toy目录下新建两个文件toy_graph_compiler.h和toy_graph_compiler.cpp。在toy_graph_compiler.h文件中定义类ToyGraphCompiler,ToyGraphCompiler必须继承自类GraphCompiler::Impl,并实现对应的Compile接口。 12345678910111213class ToyGraphCompiler : public GraphCompiler::Impl { public: explicit ToyGraphCompiler(const std::string &name) : GraphCompiler::Impl(name) {} virtual ~ToyGraphCompiler() = default; std::shared_ptr<Executable> Compile( const XrtGraph *graph, const std::vector<Parameter> &entry_params, const std::vector<Parameter> &return_params, const std::vector<InputOutputAlias> &aliases) override;}; 在toy_graph_compiler.cpp中实现Compile接口,并注册一个新的graph compiler。在动手实现该接口之前,有必要先解释一下该接口的参数列表,graph表示的是function子图,entry_params表示子图的输入,return_params表示子图的输出,aliases通常在包含模型更新操作时会用到,表明输出和输入是一对别名关系。被alias的输入将生命期延长到了整个子图,并且与对应的输出共享内存,因此也就间接实现了inplace计算的目的。 我们按拓扑顺序遍历子图中的每个节点(或op),依次将节点编译成具体的执行代码,并在合适的位置插入临时buffer。为了方便处理不同类型的op,我们在下面的代码中引入了ToyOpContext和ToyOpKernel的概念。 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758// Register a new graph compiler for TOY engine.REGISTER_GRAPH_COMPILER(XrtEngine::TOY, ToyGraphCompiler);// Realize Compile interface.std::shared_ptr<Executable> ToyGraphCompiler::Compile( const XrtGraph *graph, const std::vector<Parameter> &entry_params, const std::vector<Parameter> &return_params, const std::vector<InputOutputAlias> &aliases) { std::vector<Parameter> temp_buffers; std::vector<FuncCode> func_codes; std::vector<FuncArgumentIndices> func_args; std::unordered_map<std::string, int> indices; std::unordered_map<std::string, Parameter> all_params; for (auto param : entry_params) { indices.emplace(param.name(), indices.size()); all_params[param.name()] = param; } for (auto param : return_params) { indices.emplace(param.name(), indices.size()); all_params[param.name()] = param; } algorithm::TopologyVisit(*graph, [&](const XrtNode *node) { if (node->IsNoOpNode()) { // NoOp node is not computation node, so skip it return; } ToyOpContext op_context(node, all_params); auto op_kernel = BuildToyOpKernel(node->type()); op_kernel->Compile(&op_context); func_codes.push_back(op_context.func_code_); const auto &buffers = op_context.tmp_buffers_; for (auto it = buffers.begin(); it != buffers.end(); ++it) { all_params[it->first] = it->second; temp_buffers.push_back(it->second); indices.emplace(it->first, indices.size()); } // Finalize argument indices for each function FuncArgumentIndices arg_indices; for (const auto &arg : op_context.input_args_) { arg_indices.inputs.push_back(indices.at(arg)); } for (const auto &arg : op_context.output_args_) { arg_indices.outputs.push_back(indices.at(arg)); } func_args.push_back(std::move(arg_indices)); }); return std::make_shared<ToyExecutable>(this->name_, entry_params.size(), return_params, temp_buffers, func_codes, func_args);} ToyOpContext临时存储编译需要的元信息和编译结果,为ToyOpKernel提供必要的接口,ToyOpKernel则根据op类型完成单个op的编译过程。上述代码中我们实现了一个将XRT子图编译成ToyExecutable的最简单的graph compiler,下面我们将以ReLU op为例,介绍ToyOpContext和ToyOpKernel是如何对op进行编译的。 Toy Kernels 
我们回过头再仔细研究一下ToyGraphCompiler的Compile实现,ToyOpContext接受两个输入,node和当前所有已经创建过的parameters,经过OpKernel编译后输出函数代码(func_code_)、中间buffer(tmp_buffers_),以及函数代码输入和输出对应的parameter names。因此在这个例子中,ToyOpContext被设计成如下形式: 12345678910111213141516class ToyOpContext { public: ToyOpContext(const XrtNode *node, const std::unordered_map<std::string, Parameter> &all_params) : node_(node), all_params_(all_params) {} public: const XrtNode *node_; const std::unordered_map<std::string, Parameter> &all_params_; std::function<void(const std::vector<Parameter>&, const std::vector<Parameter>&)> func_code_; std::vector<std::string> input_args_; std::vector<std::string> output_args_; std::unordered_map<std::string, Parameter> tmp_buffers_;}; 对于ToyOpKernel,为了处理不同类型的op,我们采用工厂注册模式,并且这种模式还有另一个用处,就是在XRT划分子图时可以用来判断该引擎是否支持某个类型的op。XRT已经将kernel注册接口封装成了一个辅助类OpKernelRegistrar,但同时也要求ToyOpKernel必须继承基类OpKernel。 1234class ToyOpKernel : public OpKernel<ToyOpContext> { public: virtual void Compile(ToyOpContext *ctx) = 0;}; 使用OpKernelRegistrar定义一个用来注册ToyOpKernel的宏。 12345678#define REGISTER_TOY_OP_KERNEL(OpName, KernelType) \\ static auto _toy_op_kernel_##OpName##_ __attribute__((unused)) = \\ OpKernelRegistrar(#OpName) \\ .SetEngine(XrtEngine::TOY) \\ .SetDevice({XrtDevice::GPU_CUDA}) \\ .SetFactory([]() -> OpKernelBase * { \\ return new KernelType; \\ }) 最后我们实现一个Relu的OpKernel,填充ToyOpContext的func_code_、tmp_buffers_以及输入输出arguments。 1234567891011121314151617181920212223242526272829303132333435void ComputeRelu(const Parameter &input, const Parameter &output) { //TODO(hjchen2)}class ToyReluOpKernel : public ToyOpKernel { public: void Compile(ToyOpContext *ctx) override { ctx->func_code_ = [](const std::vector<Parameter> &inputs, const std::vector<Parameter> &outputs) { CHECK_EQ(inputs.size(), 1); CHECK_EQ(outputs.size(), 1); ComputeRelu(inputs[0], outputs[0]); }; for (const XrtEdge *edge : ctx->node_->in_edges()) { const auto &name = edge->argument().name(); CHECK_GT(ctx->all_params_.count(name), 0); // TODO(): Filter duplicate input ctx->input_args_.push_back(name); } for (const XrtEdge *edge : ctx->node_->out_edges()) { const auto &name = edge->argument().name(); // TODO(): Filter duplicate output ctx->output_args_.push_back(name); if (ctx->all_params_.count(name) == 0 && ctx->tmp_buffers_.count(name) == 0) { auto param = CreateParameter(name /*argument name*/, edge->argument().shape(), edge->argument().data_type()); ctx->tmp_buffers_[name] = std::move(param); } } }}; 最后将ToyReluOpKernel注册到Toy引擎对应的OpKernel工厂下。 123REGISTER_TOY_OP_KERNEL(relu, ToyReluOpKernel) .EnableTrainPhase() .Finalize(); EnableTrainPhase表示该op支持训练,OpKernelRegistrar也提供了其他一些接口,比如设置支持的device列表,mutable variables(inplace更新)和是否是model update op(model update op会影响子图划分)。 CMake编译 在CMakeList.txt中添加一个BUILD_TOY的选项,并在oneflow_xrt/CMakeLists.txt中添加如下toy引擎模块的编译代码, 12345678910111213if(BUILD_TOY) file(GLOB_RECURSE XRT_TOY_SRCS compiler/toy/*.cpp) add_library(oneflow_xrt_toy ${XRT_TOY_SRCS}) add_dependencies( oneflow_xrt_toy ${XRT_THIRD_PARTY_LIBRARIES}) target_link_libraries( oneflow_xrt_toy oneflow_xrt ${XRT_THIRD_PARTY_LIBRARIES}) target_include_directories( oneflow_xrt_toy PRIVATE ${ONEFLOW_INCLUDE_DIR})endif() 之后在oneflow_xrt/python目录中添加导出Python模块的代码toy_stub.cpp, 1234#include <pybind11/pybind11.h>#include <pybind11/stl.h>PYBIND11_MODULE(_oneflow_xrt_toy_internal, m) {} 并在oneflow_xrt/python/CMakeLists.txt中增加如下代码, 123if(BUILD_TOY) oneflow_xrt_add_stub(oneflow_xrt_toy toy_stub.cpp)endif() 编译和安装Python wheel包 修改setup.py文件,新增一个toy extension的编译,并在build_ext函数中开启BUILD_TOY选项, 12345setup_extension( 
"oneflow_xrt_toy", cmake_args=["-DBUILD_TOY=ON"], description=("oneflow_xrt's toy extension"),) 执行命令python3 setup.py install完成wheel包的编译和安装,最后执行如下代码测试添加的toy引擎是否可以正常执行, 12345678910111213import oneflow as flowimport oneflow_xrt as flowrtclass ReluGraph(flow.nn.Graph): def __init__(self): super().__init__() def build(self, input): return flow.nn.functional.relu(input)m = flowrt.XRTModule(ReluGraph(), engine="toy")x = flow.randn(2, 3, device="cuda")print(m(x))","categories":[{"name":"XRT","slug":"XRT","permalink":"https://hjchen2.github.io/categories/XRT/"}],"tags":[{"name":"XRT","slug":"XRT","permalink":"https://hjchen2.github.io/tags/XRT/"},{"name":"Compiler","slug":"Compiler","permalink":"https://hjchen2.github.io/tags/Compiler/"},{"name":"TensorFlow XLA","slug":"TensorFlow-XLA","permalink":"https://hjchen2.github.io/tags/TensorFlow-XLA/"},{"name":"TensorRT","slug":"TensorRT","permalink":"https://hjchen2.github.io/tags/TensorRT/"}]},{"title":"TVM PackedFunc实现机制","slug":"TVM-PackedFunc实现机制","date":"2020-01-10T04:24:08.000Z","updated":"2023-02-07T02:41:00.291Z","comments":true,"path":"2020/01/10/TVM-PackedFunc实现机制/","link":"","permalink":"https://hjchen2.github.io/2020/01/10/TVM-PackedFunc%E5%AE%9E%E7%8E%B0%E6%9C%BA%E5%88%B6/","excerpt":"TVM PackedFunc实现 为了便于Python和C++混合编程,TVM使用了统一的PackedFunc机制。PackedFunc可以将C++中的各类函数打包成统一的函数接口,并自动导出到Python模块中进行调用,并且也支持从Python中注册一个函数,并伪装成PackedFunc在C++和Python中调用。","text":"TVM PackedFunc实现 为了便于Python和C++混合编程,TVM使用了统一的PackedFunc机制。PackedFunc可以将C++中的各类函数打包成统一的函数接口,并自动导出到Python模块中进行调用,并且也支持从Python中注册一个函数,并伪装成PackedFunc在C++和Python中调用。 预备知识 Python ctypes混合编程 ctypes是Python自带的跨语言函数调用库,ctypes提供了简单的C数据类型,可以将C/C++动态库中的函数包装成Python函数进行调用。 导出C++函数 首先在C++中定义一个全局函数,并编译生成C++动态库。 1234// test.hextern "C" {int add(int a, int b);} 12345// test.cc#include "test.h"int add(int a, int b) { return a + b;} 用ctypes模块在Python中加载生成的动态库(test.so),并调用C++中的函数。 1234567891011import ctypes# Load shared library_LIB = ctypes.CDLL("./test.so", ctypes.RTLD_GLOBAL)a = ctypes.c_int(1)b = ctypes.c_int(2)# Call C func in Pythonprint(_LIB.add(a, b))# Orprint(_LIB.add(1, 2)) 传递Python函数到C++ ctypes也支持将Python函数转换成C类型的函数,并在C/C++中进行调用。 12def add(a, b): return a + b Python add有两个参数a和b,返回值类型与a和b的类型一致。在C++中可以为Python add定义一个函数原型 int(int, int)。 1234extern "C" {typedef int (*PyCFunc)(int, int);int call_py_func(PyCFunc f, int a, int b);} 1234#include "test.h"int call_py_func(PyCFunc f, int a, int b) { return f(a, b);} 使用ctypes将Python函数转换成C function,传入C++中进行调用。 1234567891011121314import ctypescfunc = ctypes.CFUNCTYPE( ctypes.c_int, # return type ctypes.c_int, # arg0 type ctypes.c_int # arg1 type )f = cfunc(add)# CFUNCTYPE is callable in Pythonprint(f(5, 1))# Call Python func in Cprint(_LIB.call_py_func(f, 5, 1)) PackedFunc实现 PackedFunc定义 ctypes可以很方便的将C/C++中的函数导出到Python,调用时直接传入对应的参数即可,但如果需要将Python函数导入到C/C++,则需要在C/C++中提前定义好对应的函数原型(比如上面的PyCFunc),并提供对应函数的调用入口(call_py_func)。为了支持更加灵活的函数定义,TVM将不同类型的函数包装成统一的函数原型。 1void(TVMArgs args, TVMRetValue *rv); 统一的函数原型被封装成PackedFunc对象,提供通用的调用接口,直接与调用者进行交互。 1234567891011class PackedFunc { public: using FType = std::function<void (TVMArgs args, TVMRetValue* rv)>; template<typename... Args> inline TVMRetValue operator()(Args&& ...args) const; inline void CallPacked(TVMArgs args, TVMRetValue* rv) const; private: /*! 
\\brief internal container of packed function */ FType body_;}; 当获得一个PackedFunc对象时,我们就可以像调用普通函数一样调用PackedFunc打包的函数。比如: 123PackedFunc f;// f(1, 2)首先会自动将参数1,2打包成TVMArgs,接着调用CallPacked,CallPacked最终的执行体是body_TVMRetValue ret = f(1, 2); 函数打包 TVM支持对各类函数进行打包,包括一般的函数、类的成员函数以及lamda表达式。 函数原型萃取 萃取函数原型是为了得到函数的参数和返回值类型。TVM中使用decltype和模版结构体function_signature来实现。 比如定义一个简单的C函数, 123int add(int a, int b) { return a + b;} 接下来就可以使用如下的代码来萃取add的函数原型, 1234567template <typename R, typename ...Args>struct function_signature<R(Args...)> { using FType = R(Args...);};// 萃取add的函数原型using FType = function_signature<decltype(add)>::FType; 此外只需要特化function_signature就可以支持函数指针和lambda表达式。注意:TVM function_signature不支持普通成员函数的类型萃取,因此TVM需要借助一个辅助function_signature_helper来对lambda表达式类型进行萃取,而我们这里的function_signature支持普通成员函数,因此lambda表达式类型萃取可以通过递归的function_signature来实现。 1234567891011121314151617181920212223// 普通函数指针template <typename R, typename ...Args>struct function_signature<R(*)(Args...)> { using FType = R(Args...);};// 非const类的成员函数指针template <typename T, typename R, typename ...Args> struct function_signature<R(T::*)(Args...)> { using FType = R(Args...);};// const类的成员函数指针template <typename T, typename R, typename ...Args> struct function_signature<R(T::*)(Args...) const> { using FType = R(Args...);};// lambda表达式template<typename T>struct function_signature { using FType = typename function_signature<decltype(&T::operator())>::FType;}; 函数打包 一旦萃取到了函数原型,TVM就利用TypedPackedFunc对普通函数或lambda表达式进行打包。TypedPackedFunc只支持对R(Args...)类型的函数打包,所以如果被打包的函数是一个函数指针,则需要创建一个lambda表达式,转换成R(Args...)类型之后再用TypedPackedFunc对创建的lambda表达式进行打包。 1234567891011121314151617template<typename R, typename ...Args>class TypedPackedFunc<R(Args...)> { public: using TSelf = TypedPackedFunc<R(Args...)>; template<typename FLambda, typename = typename std::enable_if< std::is_convertible<FLambda, std::function<R(Args...)> >::value>::type> TypedPackedFunc(const FLambda& typed_lambda) { // NOLINT(*) this->AssignTypedLambda(typed_lambda); } ... private: ... 
PackedFunc packed_;}; 当被打包的函数用来实例化TypedPackedFunc对象时,会立刻调用AssignTypedLambda将被打包的函数打包成PackedFunc。 1234567template<typename R, typename ...Args>template<typename FType>inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) { packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) { detail::unpack_call<R, sizeof...(Args)>(flambda, args, rv); });} AssignTypedLambda实际上是将被打包的函数先封装成了一个函数原型为void(const TVMArgs &args, TVMRetValue *rv)的lambda表达式,然后将这个lambda表达式作为PackedFunc对象的一个成员,通过设置合适的接口(重载operator ()),使得PackedFunc与被打包的源函数表现的完全一样了。 自动导出函数 TVM将需要从C++自动导出的函数打包成PackedFunc,然后通过宏TVM_REGISTER_GLOBAL注册到全局的一个map中。比如: 1234TVM_REGISTER_GLOBAL("_Var").set_body_typed([](std::string s, DataType t) { return VarNode::make(t, s); }); 当Python加载编译好的动态库时,会自动查询map中静态注册的函数,每个函数都包装成Python中的Function对象,最终添加到Python模块中。Function重定义了函数调用接口,自动完成参数打包过程。 如果是在Python中动态注册的函数,则需要在Python中通过函数名和来查询PackedFunc,返回一个PackedFunc的handle(函数指针),并封装成Function。 12345678910def get_global_func(name, allow_missing=False): handle = FunctionHandle() check_call(_LIB.TVMFuncGetGlobal(c_str(name), ctypes.byref(handle))) if handle.value: return Function(handle, False) if allow_missing: return None raise ValueError("Cannot find global function %s" % name) 注:TVMFuncGetGlobal是通过ctypes导出的C++接口,FunctionHandle是ctypes中表示void指针类型(c_void_p)。 从Python注册函数 由于TVM中PackedFunc的精心设计,我们只需要将Python中的函数转换成统一的函数原型void(const TVMArgs, TVMRetValue),然后将函数转换成PackedFunc并动态地注册到全局的map中。 先将Python函数用ctypes转成int(TVMValue , int , int, void , void )的C函数。 1234567TVMPackedCFunc = ctypes.CFUNCTYPE( ctypes.c_int, ctypes.POINTER(TVMValue), ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p) 然后通过TVMFuncCreateFromCFunc将上面的C函数转换成统一的PackedFunc函数。 12345678910111213141516171819int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPackedCFuncFinalizer fin, TVMFunctionHandle *out) { API_BEGIN(); if (fin == nullptr) { *out = new PackedFunc( [func, resource_handle](TVMArgs args, TVMRetValue* rv) { int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) args.num_args, rv, resource_handle); if (ret != 0) { throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); } }); } else { ... 
} API_END();} 最后通过接口TVMFuncRegisterGlobal注册到全局的map中。下面是从Python中注册一个函数,并在Python中调用的例子。 123456789101112targs = (10, 10.0, "hello")@tvm.register_funcdef my_packed_func(*args): assert(tuple(args) == targs) return 10# Get it out from global function tablef = tvm.get_global_func("my_packed_func")assert isinstance(f, tvm.nd.Function)y = f(*targs)assert y == 10","categories":[{"name":"tvm knowledge","slug":"tvm-knowledge","permalink":"https://hjchen2.github.io/categories/tvm-knowledge/"}],"tags":[{"name":"TVM","slug":"TVM","permalink":"https://hjchen2.github.io/tags/TVM/"},{"name":"PackedFunc","slug":"PackedFunc","permalink":"https://hjchen2.github.io/tags/PackedFunc/"}]},{"title":"图替换","slug":"图替换","date":"2019-12-26T05:54:04.000Z","updated":"2023-02-07T02:37:14.493Z","comments":true,"path":"2019/12/26/图替换/","link":"","permalink":"https://hjchen2.github.io/2019/12/26/%E5%9B%BE%E6%9B%BF%E6%8D%A2/","excerpt":"背景 图替换(或者叫图改写)是一种重要的图优化技术,几乎在所有的开源框架(尤其是移动端框架)中都有应用。比如tensorflow r1.14版本中就包含了155个替换子,而且实现这些替换子的总代码量接近53k行。 一些常见的图优化技术: DCE CSE(公共子表达式消除) 常量折叠 数学公式简化 Op融合 Layout变换 内存优化(swap-in/swap-out、重计算)","text":"背景 图替换(或者叫图改写)是一种重要的图优化技术,几乎在所有的开源框架(尤其是移动端框架)中都有应用。比如tensorflow r1.14版本中就包含了155个替换子,而且实现这些替换子的总代码量接近53k行。 一些常见的图优化技术: DCE CSE(公共子表达式消除) 常量折叠 数学公式简化 Op融合 Layout变换 内存优化(swap-in/swap-out、重计算) 由于目前的编译器技术通常基于low-level的中间表达,注重对局部计算的优化,对于跨多个粗粒度op的优化要不无能为力,要不就得增加编译器的分析难度并导致代码膨胀。一般来说AI框架支持的粗粒度op非常有限,而且这些op的组合常常也比较固定,比如convolution通常和bias_add、relu组合使用,因此基于高层中间表达的图替换成为一种比较可行的优化方案。经过图替换优化后的计算图再经过编译器的优化后,生成最终的硬件代码。 目前主流开源框架的图替换都是基于经验和手工设置的替换子来实现的,在这里统称为经典图替换技术。 经典图替换 图替换是将原始计算图替换成另一个优化后的等价计算图,替换后的计算图通常是硬件友好的,比如可以消除中间结果,降低内存占用,减少访存和计算量,并且不影响最终的计算结果。 在进行图替换之前,首先需要定义出源计算图到目标计算图的替换规则(替换子),由于这些替换规则往往需要依靠人的经验一条条手工去定义,因此称之为经典图替换。给出一条替换子,我们需要在原始计算图中不断地去匹配替换子的源计算子图,一旦匹配到满足要求的子图后,就将源计算子图重新映射为替换子中的目标计算图。 在一些开源框架中,替换子的定义形式不尽相同。在TensorFlow中源图匹配和替换的定义是非常松散的,它甚至没有直接定义出替换子的源图,而是定义一系列约束来判断是否匹配。PaddlePaddle中则是将一个替换过程定义为一个pass,pass执行时动态构建相应的替换子源图,执行匹配算法并回调源图到目标图的替换函数。比如下面是TensorFlow中将Conv+BiasAdd替换成FusedConv的过程。 定义匹配约束 12345678910111213141516struct ContractionWithBiasAdd { int constraction; int bias_add;}// node为输入的grapper node, pattern为输出的ContractionWithBiasAdd.bool FindContractionWithBias(node,*pattern) { // 开始列举匹配的constractions. 1、如果node存在控制边,返回false 2、如果node不是BiasAdd,返回false 3、如果node的父节点不是Conv或MatMul,返回false 4、... 
// 如果以上所有constructions都满足,则将需要替换的node id写到特定的pattern中。 pattern->constraction = node的父节点; pattern->bias_add = node; return true;} 定义替换过程 1234567// pattern为输入的ContractionWithBiasAdd,void AddFusedContractionNode(pattern, *invalidated_nodes) { 1、创建一个新的node:fused_op 2、将Conv或MatMul的input和filter添加到fused_op的输入中,并将BiasAdd的bias加到fused_op的输入 3、根据Conv或MatMul的一些参数设置fused_op的参数,比如conv的kernel、channel、padding等等,以及matmul的transpose等 4、将fused_op加入到graph,同时将Conv或MatMul和BiasAdd加入到invalidated_nodes} TensorFlow采用的定义匹配约束的方式与直接定义出子图的方式本质上是等价的,但相比后者可读性较差,而优点就是代码可复用性高,比如上面的FindContractionWithBias可以同时匹配Conv+BiasAdd和MatMul+BiasAdd两种子图,并且这些约束便于嵌套使用。 无论是TensorFlow还是PaddlePaddle,图替换都是不完全的。比如说对于Conv+BiasAdd+BiasAdd这种计算图,第一次只能匹配到Conv+BiasAdd,替换后又变成了一个Conv+BiasAdd的计算图,因此TensorFlow中默认采用了两遍优化。根据TensorFlow公开的一些数据,基本上第二次优化的机会已经非常少了。 InceptionV3 Seq2Seq 基于超优化的图替换 超优化(Superoptimization)是现代编译器中的一种指令优化技术,其主要工作原理是通过随机生成指令序列以及暴力搜索的方式自动找到一组优化的指令序列,并等价替换原有的指令序列。1992年第一个Superoptimizer被集成到了GCC编译器,之后Google也为LLVM开发了一个Superoptimizer,取名为Souper。 依靠人工设定的编译器往往对代码的优化不够彻底,给生成的code留下了大量的优化空隙,而且人工设定的优化规则往往没有经过充分验证,经常导致各种极端条件下的代码bug。Superoptimization将指令序列优化问题转换为自动搜索问题,并加入了自动化验证和一阶逻辑验证,在发现代码优化空隙的同时优化结果也更加可靠。 TASO(Tensor Algebra SuperOptimizer)将Superoptimization用于DNN高层中间表达的图优化,在大多数模型上取得了比XLA和TensorRT更优的效果。TASO的工作是MetaFlow(作者另一个基于人工规则的图替换框架)的延续,因此也采用了与MetaFlow一致的替换子定义。MetaFlow替换子的定义包括:源图、目标图、输入和输出的映射关系。 TASO相比其他开源框架最大的区别就是不需要手工去设定各种各样的替换子,只需要像设计硬件指令一样设计出基本的算子定义(或者计算逻辑),之后系统会根据指定的算子集自动生成满足条件的替换子,经过验证的替换子最终作用于图替换过程。基于高度抽象的替换子定义,TASO可以独立于具体的训练或预测框架,离线完成替换子的生成和验证,并在图优化阶段加载到程序中进行图替换。尽管手工设计有很多弊端,但TASO在代码实现过程中并没有完全抛弃手工设计的方式,而是采用了手工设计和替换子自动生成相结合的方式。 替换子定义 替换子包含三个部分,源图、目标图、输入和输出tensor的映射关系。并且替换子通常是与shape无关的,源图和目标图都是由算子构成的,每个算子都可以指定一些配置,比如kernel指定卷积核的大小、axis指定reduce的维度等等。 但需要注意的是concat和split两个算子,在图替换中这两个算子通常用于算子融合,比如下图对两个不同的输入B和C进行相同的MatMul操作,就可以替换为先将输入B和C进行一次合并,然后调用一次MatMul后,对结果进行切分得到两个输出X和Y。 为了能正确切分出X和Y,在Concat时我们需要给每个维度维护一个分割树(split tree)。一个行分割的例子如下,图中需要将A和B按照第0维进行concat,因此输入A在第0维有一个原始的分割树[0, \\(S_{A}\\)],表示对于tensor A,第0维从0到\\(S_{A}\\)行都是A的数据区域。A和B concat后tensor的row变成了\\(S_{A}+S_{B}\\),并且通过分割树可以知道第0到\\(S_{A}\\)行是A的数据,从\\(S_{A}\\)到\\(S_{A}+S_{B}\\)行是B的数据。根据分割树,Split非常容易地就可以将数据进行切分。TASO的分割树支持任意维度的切分和递归切分。 替换子生成 替换子生成包含两个阶段:构建搜索空间,以及对潜在的替换子进行测试。 构建搜索空间 搜索空间由任意合法的计算图构成,而计算图由给定的算子集中的算子组成。TASO向我们表明了一种暴力枚举、深度优先递归构建的方法。 给定算子集和初始化的input tensor集合,对于每一个输入tensor,每次从算子集中选择一个合法的算子构建graph,并计算当前graph的输出tensor,将输出tensor加入到input tensor集合, 保存graph以及graph的fingerprint(对输出tensor计算hash值),接着重复上面的过程继续加入算子,直到递归的深度达到设定的上限。 对于同样的输入tensor,如果构建的两个计算图的输出tensor相同,则这两个计算图构成了一个潜在的替换子。为了避免出现浮点计算异常的情况,构建计算图时所有的tensor都是int类型。 测试潜在替换子 为了进一步验证潜在替换子的合法性,TASO设计了一系列cases来测试潜在替换子。每个测试case都使用随机初始化的输入tensor,当两个计算图结果一致时才认为测试通过,只有所有测试cases都通过的潜在替换子才是合法的替换子。 与构建计算图时使用int类型的tensor不一样,所有测试case的输入tensor都是-1到1之间的浮点数。由于relu对于所有小于0的值都返回0,因此可能导致非法的替换子通过测试cases,作者认为可以使用任意一个非线性函数来代替relu,TASO中使用\\(x(x+1)+1\\)。 替换子验证 TASO同时使用一阶逻辑表达的算子属性对替换子进行进一步验证,这些属性通常是由人工定义,并且经过充分review和大量测试验证过的。 在定义算子属性之前,首先需要对算子进行符号建模,算子模型通常包含参数和输入tensors。比如\\(conv(s, p, c, x, y)\\)表示conv算子的符号模型,\\(s\\),\\(p\\),\\(c\\)是conv的参数,分别表示stride、padding和activation,\\(x\\)和\\(y\\)是卷积操作的两个输入。如果activation是none,很显然conv就是一个线性操作,因此满足以下属性: \\[ \\begin{aligned} ∀s,p,x,y,z. 
conv(s,p,Anone,ewadd(x,y),z) = \\\\ ewadd(conv(s,p,Anone,x,z),conv(s,p,Anone,y,z)) \\end{aligned} \\] TASO定义了大量的算子属性,并且使用z3(一阶逻辑验证器)对所有合法的替换子进行验证。如果有合法的替换子无法被一阶逻辑验证,则需要根据替换子手动添加一条算子属性,以确保所有合法的替换子都能验证通过。 冗余替换子裁剪 自动生成的替换子往往存在大量的冗余,TASO使用了两种策略消除冗余。 Input tensor renaming 对输入进行重命名的方式消除不同替换子之间的冗余。比如下面两个替换子, 替换子a: 替换子b: 将替换子a的一个输入tensor A改名为C,就得到了替换子b,说明这两个替换子存在冗余,因此最终只会保留更加通用的替换子b。 Common subgraph 如果替换子的源图和目标图包含同样的子图,则可以用一个相同的tensor替换掉公共子图。比如下面的一个替换子, source graph和target graph包含同一个子图(B x C),将source graph替换成target graph时,公共子图没有任何变化,因此可以将子图消除。 实验结果显示,裁剪可以消除大量的冗余替换子。 低精度和layout优化 相关资料 https://cs.stanford.edu/~zhihao/papers/sosp19.pdf https://github.com/jiazhihao/TASO TensorFlow Graph Optimizations, https://web.stanford.edu/class/cs245/slides/TFGraphOptimizationsStanford.pdf https://github.com/google/souper","categories":[{"name":"graph optimization, 图优化","slug":"graph-optimization-图优化","permalink":"https://hjchen2.github.io/categories/graph-optimization-%E5%9B%BE%E4%BC%98%E5%8C%96/"}],"tags":[{"name":"图替换","slug":"图替换","permalink":"https://hjchen2.github.io/tags/%E5%9B%BE%E6%9B%BF%E6%8D%A2/"},{"name":"超优化","slug":"超优化","permalink":"https://hjchen2.github.io/tags/%E8%B6%85%E4%BC%98%E5%8C%96/"},{"name":"graph optimization","slug":"graph-optimization","permalink":"https://hjchen2.github.io/tags/graph-optimization/"},{"name":"super optimization","slug":"super-optimization","permalink":"https://hjchen2.github.io/tags/super-optimization/"},{"name":"substitution","slug":"substitution","permalink":"https://hjchen2.github.io/tags/substitution/"}]},{"title":"FusionStitching, Deep Fusion and Code Generation for Tensorflow Computations on GPUs","slug":"DeepFusion","date":"2019-11-27T04:00:04.000Z","updated":"2023-02-07T02:39:43.223Z","comments":true,"path":"2019/11/27/DeepFusion/","link":"","permalink":"https://hjchen2.github.io/2019/11/27/DeepFusion/","excerpt":"FusionStitching系统概述 屏幕快照 2019-11-25 13.56.40 输入HloModule,经过以下三个阶段,最终输出LLVM IR。 Computation Fusion Schedule Planning Code Generation 论文主要针对XLA Fusion算法进行了改进,提出了实现Block合并策略的Schedule和Shared Memory Planning技术,以及实现对应的IR Emitter。","text":"FusionStitching系统概述 屏幕快照 2019-11-25 13.56.40 输入HloModule,经过以下三个阶段,最终输出LLVM IR。 Computation Fusion Schedule Planning Code Generation 论文主要针对XLA Fusion算法进行了改进,提出了实现Block合并策略的Schedule和Shared Memory Planning技术,以及实现对应的IR Emitter。 Computation Fusion 利用Work/Span analysis,将instruction划分到不同的layer,然后Deep Fusion模块在Schedule Consistency Checker的指导下完成跨layer的instruction合并。该过程是迭代进行的,直到完全没有合并机会。 Work/Span analysis Work/Span analysis通常用于并行算法的分析。假设每个基本运算执行时间都是单位时间,则Work表示的是所有基本运算时间总和,Span表示最长依赖路径上的基本运算时间总和。对于一个计算图来说,可以简单认为图中所有的计算节点总执行时间表示Work,而计算图的最大深度的路径上的节点的顺序执行总时间表示Span。 在这里作者用Span来表示每个节点到root节点的深度。 屏幕快照 2019-11-26 18.28.17 经过Work/Span analysis后,HloModule中的Instruction被划分到了不同的layer,相同Span值的Instruction的layer相同,并且同一layer的Instruction没有依赖关系。 Subgraph Fusion Algorithm 基于Work/Span analysis计算得到的Span值,作者提出了不同于XLA的Fusion算法。 SchdConsistent用来判断fusion_root和hlo是否应该合并,其具体的执行逻辑如下: 如果hlo有一个consumer在giveup集合中,为了防止潜在的循环依赖,放弃fusion。 如果hlo的所有consumer都不在fused集合中,则放弃fusion,因为这里只考虑Producer/Consumer的合并,没有消费关系的Instruction合并则会在ElementwiseFusion算法中处理。 最后会判断合并后的Computation是否存在一个可行的optimized shedule。如果不存在,则放弃fusion。 算法简单高效,Work/Span Analysis的作用其实相当于对Instruction做了一遍拓扑排序,通过简单的合并规则确保circle free。 不区分expensive op,可以通过shared memory来缓存中间结果,因此不需要重计算。 由于第一条约束的强制性,导致合并不完全。 Schedule Planning Schedule定义 Schedule通常指的是将算法指定的计算过程分配给计算资源的方法。这些计算过程可能包括线程、进程以及数据流等。 常见的一些Schedule配置: - Reorder 循环顺序重排,比如for x for y -> for y for x - Tile - Unroll - Vectorize - Parallel - some CUDA-specific 
比如blocks、threads、shared memory size等。 对于包含多个计算stage的算法,Schedule通常是由Consumer驱动,并指定何时对Consumer计算Producer(Specify when the producer is computed with respect to the consumer )。 论文中将Instruction大致分成Elementwise、Transpose、Reduce、BatchDot、Reshape和Broadcast这几种,然后基于这些op定义了一套用来表示对数据分块的Shedule配置。通过一个定义好的Shedule配置和数据的shape,我们就可以知道需要切成多少个数据块,映射到GPU硬件上就是多少个线程块(thread blocks)。 屏幕快照 2019-11-27 11.22.57 Shedule定义在输出shape上,包含三个字段:split_dim、sword和sched_type。split_dim表示切割的维度,取值[0, num_dims)。sword表示在split_dim维度上切分多少块,sword要求能被split_dim维度K整除。sched_type表示行切割还是列切割,取值Row或者Column。给定一个Instruction,其Schedule空间即所有合法的三元组(split_dim、sword和sched_type)。 上图表示Reduce Instruction的两种合法Schedule,通过split_dim和reduce dim来区分Row Schedule和Column Schedule。 Schedule约束和传播 与Instruction的Schedule定义在输出shape上一样,Computation的Schedule也定义在Root Instruction的输出上,因为Root Instruction是整个Computation的输出。 对于一个Fused Computation,需要满足Shedule相容约束:即对于Root Instruction,给定一个合法的Shedule,该Shedule需要同时被其他Instruction相容。论文中提出后向传播的方法来判断Shedule约束的相容性。 任意一个Instruction,其合法的Schedule可以根据Instruction类型和output shape来确定。如果给定的Schedule满足约束(是合法的),则把Schedule后向传播到输入shape(s),也就是输入Instruction的输出shape。否则从Root Instruction传播过来的Schedule在整个Fused Compution上不满足相容性约束。 在Subgraph Fusion算法中,两个Instruction能否合并除了需要满足circle free约束外,还需要满足后端CodeGen模块的支持。只有Schedule满足约束,CodeGen才能正确发射代码,否则CodeGen无法处理。 屏幕快照 2019-11-27 13.53.21 Table.1表明了不同Instruction的Schedule后向传播规则。Schedule约束判断结果会反馈到Subgraph Fusion过程,Fusion不应该破坏Schedule相容性约束。 Schedule Tuning 任意一个Instruction,split_dim=0和sword=1的Row Schedule总是合法的,也就是只有一个数据块,并且只用一个GPU线程块来计算。这样做的问题也很明显,就是无法充分利用GPU硬件资源。每个Instruction可能有多个合法的Schedule,Schedule Tuning用来选择一个合适的Schedule。 如果Computation中只有一个Root,遍历该Root Instructon所有合法的满足约束的Schedule,在performance library中查找每个kernel的执行时间,并统计总耗时。总耗时最少的Schedule会被选择用来Code Generation。 如果Computation中有多个Roots,则采取一种two-stage的方法加速Schedule的搜索过程。 第一步:遍历所有的Roots,计算blocks和blocks对应的Schedule两个序列。对所有Roots对应的blocks序列求交集,结果对应的Schedule即合法的候选Schedule。 第二步:遍历所有的候选Schedule,计算每个Schedule下所有kernel的耗时,选择耗时最少的Schedule。论文中还提到可以忽略部分ops和early stop的搜索策略,加速第二步的搜索过程。 Code Generation Shared Memory Planning 标记出所有可能需要用到Shared Memory的候选ops,当Memory不足时优先满足most critical ops。 Size Requirement Analysis 直接分配 对于非Root Instruction的Reduce和BatchDot,必须将中间结果放在Shared Memory,allowing consumer ops to use seperate parallel loop emitters to generate code。 按优先级分配 对于有多个Users的Elementwise op,为了避免重计算,可以选择将结果缓存到Shared Memory。在memory受限的情况下,按照优先级(expensive op > 非expensive op)确定使用Shared Memory。 有时对于只有一个User的expensive op也需要用到Shared Memory,比如如果expensive op后面接了一个BatchDot,由于BatchDot本身对数据的复用性比较高,将expensive op的结果缓存到Shared Memory会带来非常好的性能优化。 Size Shrinking Size Shrinking与上面Requirement Analysis的第2点类似。当每个线程Block分到的数据块非常大时,可能存在Shared Memory受限的问题。解决办法就是让一些ops退化为重计算。 从inexpensive ops开始,然后expensive ops,之后是带有BtachDot的expensive ops,最后按照靠近Root Instruction的程度选择候选ops,并优先选择靠近输出的ops。 Space Sharing 不同ops分配的Shared Memory是可以复用的,论文中作者提出从Root Instruction开始构造一颗支配树,支配节点可以复用被支配节点申请的Shared Memory。 Code Generation XLA使用GpuElementalIrEmitter来实现线程合并的Computation。基于XLA的GpuElementalIrEmitter,作者实现了用于Block合并的IrEmitter (论文中称作IrEmitterStitched)。 IrEmitterStitched输入有hlo、root、shared、schedule和generators。 hlo: 待处理的hlo Instruction root: 是否是root Instruction shared: 是否将输出写到shared memory shedule: row schedule还是column schedule generators:与XLA GpuElementalIrEmitter中的generators_类似,但是能处理shared memory的情况。 基本逻辑如下: 如果待处理的Instruction不是root Instruction,并且没有用到Shared Memory,不是Dot和Reduce Opcode,则回退到XLA的GpuElementalIrEmitter中去处理,否则使用IrEmitterStitched发射LLVM代码。 如果需要用到Shared Memory,则调用EmitWriteSharedArray将结果写到Shared Memory。 如果是root 
Instruction,则调用EmitWriteOutputArray将结果写到Global Memory。如果不是root Instruction,则调用EmitGenerator在generators中创建一个entry,以支持当前Instruction与其他Instruction的合并。 XLA op fusion规则 Consumer本身支持合并 特定op不支持与Producer合并,比如Parameter、While、Conditional、Call等,以及op本身has a side effect或者调用了has a side effect的op。此外被标记为tracing的op也无法合并。 Consumer与Producer之间支持合并 Consumer和Producer之间所有的op均可以被合并到Consumer。 对于Consumer和Producer之间所有的op: 如果直接Producer已经是一个Fusion op,则不能合并。 对Reduce和Scatter,以及CustomCall/LibraryCall的一些限制。 如果直接Producer有其他Consumer,则Fusion会导致该Producer 需要重计算。如果Producer属于expensive op或为Parameter op则放弃合并。","categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"}],"tags":[{"name":"XLA","slug":"XLA","permalink":"https://hjchen2.github.io/tags/XLA/"},{"name":"Deep Learning Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"FusionStitching","slug":"FusionStitching","permalink":"https://hjchen2.github.io/tags/FusionStitching/"}]},{"title":"混合精度训练","slug":"混合精度训练","date":"2018-02-03T04:00:04.000Z","updated":"2023-02-07T02:39:53.138Z","comments":true,"path":"2018/02/03/混合精度训练/","link":"","permalink":"https://hjchen2.github.io/2018/02/03/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6%E8%AE%AD%E7%BB%83/","excerpt":"MIXED PRECISION TRAINING https://arxiv.org/pdf/1710.03740.pdf 论文概述 nvidia的Pascal和Volta系列显卡除了支持标准的单精度计算外,也支持了低精度的计算,比如最新的Tesla V100硬件支持了FP16的计算加速,P4和P40支持INT8的计算加速,而且低精度计算的峰值要远高于单精浮点的计算峰值。","text":"MIXED PRECISION TRAINING https://arxiv.org/pdf/1710.03740.pdf 论文概述 nvidia的Pascal和Volta系列显卡除了支持标准的单精度计算外,也支持了低精度的计算,比如最新的Tesla V100硬件支持了FP16的计算加速,P4和P40支持INT8的计算加速,而且低精度计算的峰值要远高于单精浮点的计算峰值。 为了加速训练过程以及减少显存开销,baidu Research和nvidia在这篇论文中合作提出了一种FP16和FP32混合精度训练的方法,并且在CNN分类和检测、语音识别和语言模型任务上进行了验证,实验过程中使用的GPU就是Tesla V100。 训练过程中每层的权重都存成FP32格式(Mater-Weights),每次训练时都会将FP32的权重降精度至FP16( a master copy),前向输出和后向梯度都使用FP16进行计算,更新时将FP16的梯度累加到FP32的Mater-Weight上。 混合精度的必要性 由于FP16所能表示的subnormal最小正数是\\(2^{−24}\\) ≈ \\(5.96 × 10^{−8}\\)(Half-precision floating-point format),也就是说在区间(\\(-2^{-24}, 2^{-24}\\))的数(或者说指数位小于-24的数)使用FP16表示时都会变成0。在一个普通话识别的模型训练中,有将近5%的权重梯度的指数位小于-24,如果更新时也用FP16计算,那么这些数在乘以学习率后都将变成0,从而对最终模型效果产生负面影响,使用混合精度训练的方式可以避免这种问题。 Loss scaling 混合精度训练可以解决权重更新量很小的问题,但无法解决梯度本身很小的问题。在一些网络中(比如SSD),梯度大部分都在FP16的表示范围之外,因此需要将梯度平移到FP16的表示范围内 。 平移实际上就是对梯度值乘以一个系数(等于\\(2^{n}\\),\\(n\\)为平移的位数),但另一种简单高效的方法是直接在前向时就将loss乘以scale,这样在后向传导时所有的梯度都会被乘以相同的scale。权重更新时需要将移位后的梯度除以scale后,再更新到权重上。 论文中提到他们在实验过程中使用的scale是8~32K,最终取得了与FP32一致的收敛结果。对于scale的选择,论文没有统一的方法,只是提到scale并没有下界,只要选择的scale不会在后向计算时导致溢出就行。 实验结果 图像分类 物体检测 语音识别 机器翻译 语言模型 MIXED PRECISION TRAINING OF CONVOLUTIONAL NEURAL NETWORKS USING INTEGER OPERATIONS https://openreview.net/forum?id=H135uzZ0- 论文概述 半精度(16bit)分为半精度浮点(FP16)和半精度定点(INT16),FP16和INT16提供不同的精度和表示范围。INT16相比FP16的动态范围低,但精度更高,因此INT16相比FP16会带来更低的精度误差。 现在深度学习领域公认的数据类型是单精度浮点(float),半精和单精除了在直观感觉上的数据类型不同之外,在计算(algorithmic)和语义(semantic)上也会有很多的不同,比如说FP16的乘加操作得到的结果是FP32。因此在讨论半精度训练时,对于整个tensor的表达、乘加操作、低精度转换、缩放和规整方法和溢出处理都是需要同时考虑的。 intel的这篇论文主要受到之前flexpoint和混合精度训练的启发,从而提出了一种共享指数位的动态定点表达(dynamic fixed point representation)方法,使用INT16和float混合精度训练,在完全不进行任何调参的情况下,在多个CNN的模型上取得了当前所有低精度训练方法中最好的效果。 这篇论文主要涉及的技术点有: DFP:INT16的Tensor共享指数位,扩充INT16的动态表示范围。 instruction:两个INT16进行乘法,结果存为INT32的指令。 down-convert:基于最大值的低精度转换策略,使用nearest、stochastic和biased rounding三种不同的rounding方法。 overflow management:将局部的INT32结果累加到FP32,防止累加时溢出。 DFP(Dynamic Fixed Point) 一个DFP tensor由一个定点的tensor和该tensor共享的指数组成,更通用的表示形式为DFP-P = \\(<I, E_{s}>\\),P表示定点tensor 
\\(I\\)的位宽,\\(E_{s}\\)表示共享指数位。标准单精使用的是8bit的指数位,在该论文中使用的DFP-16共享指数位也是8bit。 DFP-16和fp32的数据转换 共享指数位需要根据tensor中的绝对值最大的数和定点化的位宽来确定,计算公式如下: \\[E_{s} = E_{fmax} - (P - 2)\\] \\(E_{s}\\)表示DFP-P的共享指数,\\(E_{fmax}\\)表示原始fp32 tensor中绝对值最大的数对应的指数\\(E_{fmax} = E(max_{\\forall f \\in F} |f|)\\) 因此fp32的tensor与DFP的tensor有以下关系: \\[\\forall i_{n} \\in I, \\ \\ \\ f_{n} = i_{n} \\times 2^{E_{s}}, \\ \\ \\ where f_{n} \\in F\\] 也就是说\\(i_{n} = rounding(\\frac{f_{n}}{2^{E_{s}}})\\),这本质上与loss scaling思想是一样的,用平移的思想来解决动态范围不够的问题。 DFP-16 tensor的乘加运算规则 1、两个DFP-16 tensor相乘,结果存为DFP-32。 \\[i_{ab} = i_{a} \\times i_{b} , \\ \\ \\ E_{s}^{ab} = E_{s}^{a} + E_{s}^{b}\\] 2、两个DFP-16 tensor相加,结果存为DFP-32。 \\[i_{ab} = \\left\\{\\begin{aligned} i_{a} + (i_{b} >> (E_{s}^{a} - E_{s}^{b})) \\ \\ \\ when E_{s}^{a} > E_{s}^{b} \\\\ i_{b}+(i_{a} >> (E_{s}^{b}-E_{s}^{a})) \\ \\ \\ when E_{s}^{a} < E_{s}^{b} \\end{aligned}\\right.\\] \\[E_{s}^{a+b} = max(E_{s}^{a}, E_{s}^{b})\\] 3、两个DFP-32 tensor相加,结果保存为fp32。 DFP-32和DFP-16的数据转换 \\[R_{s} = P - LZC(max_{\\forall i_{ab} \\in I^{32}}|i_{ab}|)\\] \\[i_{ab}^{d} = i_{ab} >> R_{s} , \\ \\ \\ E_{s}^{ab} += R_{s}\\] DFP混合精度训练 指令实现 intel的VNNI指令集中有一条DFP-16乘加的指令QVNNI16,这条指令的第一个操作数是DFP-16内存指针,第二个操作数是4个512位的向量寄存器(每个寄存器可以存储32个DFP-16),结果是一个512位的向量寄存器(该寄存器能存储16个DFP-32)。 上面的QVNNI16指令集实际上对mem输入做了两路并行展开,vinp2中一个寄存器支持同时对输入feature map的两个channel进行计算。在论文中,卷积层输入的格式为(N,C/16,H,W,16),权重的格式为(C/16,K/16,KH,KW,8c,16k,2c),C表示输入feature map的通道数,K表示输出通道数,KH和KW分别表示卷积核的height和width。 卷积计算过程伪代码: 每次对输入的ICBLK个通道进行计算,ICBLK个通道又会分成(ICBLK/16)组,每组计算16个通道,由于QVNNI指令每次只能对输入的8个通道进行计算,因此每组调用2次QVNNI16指令,计算结果vout会转换成FP32后与output累加。 实验结果 baseline和DFP-16的实验均在intel最新的Knights-Mill CPU上进行,DFP-16相比FP32训练加速1.8X。 ABS_MAX量化方案 DFP与ABS_MAX量化的区别","categories":[{"name":"low bitwidth","slug":"low-bitwidth","permalink":"https://hjchen2.github.io/categories/low-bitwidth/"}],"tags":[{"name":"int16","slug":"int16","permalink":"https://hjchen2.github.io/tags/int16/"},{"name":"fp16","slug":"fp16","permalink":"https://hjchen2.github.io/tags/fp16/"},{"name":"混合精度训练","slug":"混合精度训练","permalink":"https://hjchen2.github.io/tags/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6%E8%AE%AD%E7%BB%83/"},{"name":"loss scaling","slug":"loss-scaling","permalink":"https://hjchen2.github.io/tags/loss-scaling/"},{"name":"QVNNI16","slug":"QVNNI16","permalink":"https://hjchen2.github.io/tags/QVNNI16/"}]},{"title":"模型压缩之pruning","slug":"模型压缩论文阅读记录","date":"2018-01-02T14:00:04.000Z","updated":"2023-02-07T02:40:11.362Z","comments":true,"path":"2018/01/02/模型压缩论文阅读记录/","link":"","permalink":"https://hjchen2.github.io/2018/01/02/%E6%A8%A1%E5%9E%8B%E5%8E%8B%E7%BC%A9%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB%E8%AE%B0%E5%BD%95/","excerpt":"Regularization of Neural Networks using DropConnect DropConnect主要是用来解决全连接过拟合问题的,是Dropout的通用实现。随着神经网络参数量越来越大,过拟合的风险越来越高,之前的一些经验是使用L1/L2以及Dropout。Dropout随机地将激活函数输出置0,导致每次参与训练的参数量变少,由于随机drop的关系,每次训练的网络都可能不一样,因此实际上我们训练的是多个子模型组成的混合模型。","text":"Regularization of Neural Networks using DropConnect DropConnect主要是用来解决全连接过拟合问题的,是Dropout的通用实现。随着神经网络参数量越来越大,过拟合的风险越来越高,之前的一些经验是使用L1/L2以及Dropout。Dropout随机地将激活函数输出置0,导致每次参与训练的参数量变少,由于随机drop的关系,每次训练的网络都可能不一样,因此实际上我们训练的是多个子模型组成的混合模型。 Dropout 如果考虑激活函数为tanh和relu,则dropout的输出: \\[r=m*a(Wv)=a(m*(Wv))\\] inference时混合模型的输出: \\(o=E_{M}[a(M*(Wv))] \\approx a(E_{M}[(M*W)v])=a(pWv)\\) \\(M\\)是\\(m\\)的repeat得到的矩阵。 DropConnect 随机地将全连接层的权重值置0,即输出为: \\[r=a((M*W)v)\\] \\(M\\)是与\\(W\\)大小一致的0-1矩阵,并且\\(M_{ij}\\)服从Bernoulli(p)分布。 inference时混合模型的输出: \\[o=E_{M}[a((M*W)v)] \\approx E_{u}[a(u)] \\] where \\(u\\sim N(pWv, p(1-p)(W*W)(v*v))\\) 
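根据上面给出的均值和方差,inference 时可以直接从 u 的高斯分布中采样,再对 a(u) 求均值来近似混合模型的输出。下面是一段 numpy 写的假设性示意(函数名和形状约定均为本文为演示而假设,并非论文的原始实现;这里约定 W 形状为 [out, in],v 形状为 [in],激活函数以 tanh 为例):

import numpy as np

def dropconnect_infer(W, v, p, a=np.tanh, n_samples=1000):
    mean = p * (W @ v)                        # E[u] = p·W·v
    var = p * (1 - p) * ((W * W) @ (v * v))   # Var[u] = p(1-p)·(W∘W)·(v∘v)
    u = np.random.normal(mean, np.sqrt(var), size=(n_samples, mean.size))
    return a(u).mean(axis=0)                  # o ≈ E_u[a(u)],用采样均值近似期望

o = dropconnect_infer(np.random.randn(4, 3), np.random.randn(3), p=0.5)

采样数 n_samples 越大,近似越接近期望,但 inference 的计算量也随之增加,这正是下文总结中提到的 DropConnect 预测开销变大的原因。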
注:对于\\(u\\)的分布论文中提到用高斯矩匹配估计,但也可以用中心极限定理进行估计 训练时的伪代码: inference时的伪代码: 实验结果 总结 DropConnect的初衷是解决过拟合问题的,DropConnect虽然在训练时可以将稠密矩阵乘转化成稀疏乘的方式,减少计算量,但在inference时还是需要完整的计算一遍,然后再利用正态分布多次采样后计算均值得到下一层的输入,因此inference的计算量反而增加了。论文给出的实验结果表明DropConnect在tanh和relu激活函数时会比dropout带来更低的测试错误率,sigmoid时会比dropout差点。DropConnect给模型压缩提供了一些思路,在训练时我们都倾向于选择更复杂的模型而需要非常大的计算量,DropConnect的做法表明这些复杂的模型实际上有大量的冗余,而去除这些冗余后并不会对模型产生任何伤害,反而会增强模型的泛化能力,因此在模型压缩中,对模型进行剪枝成了一个重要的研究方向。 ##Learning bothWeights and Connections for Efficient Neural Network 作者首先关注到神经网络预测时的能耗问题,下面给出了一个45nm的CMOS处理器能耗表。 内存读取的能量消耗比其他数学指令高出三个数量级,因此论文提出对神经网络进行剪枝以压缩模型大小,减少内存读取消耗并降低计算量。剪枝不仅降低了模型复杂度,也减少了过拟合。除了剪枝,文中也提到可以借鉴HashedNets的方法进行模型参数共享,进一步降低模型大小。 模型剪枝分成三步: 1、正常训练模型,得到每个连接的重要程度(重要程度可以用权值的绝对值表示) 2、删除重要程度低的连接,将稠密网络转换成稀疏网络 3、使用保留下来的连接重训模型 第2步和第3步迭代进行。 正则化 关于正则化对剪枝结果的影响,论文给出的结论是:剪枝后重训前L1正则比L2效果好,但重训后L2比L1效果好。 Dropout Ratio调整 Dropout仍然被用来抑制过拟合,但是由于剪枝会减小模型大小,因此重训时Dropout ratio也应该更小。 \\[D_{r}=D_{0}\\sqrt{\\frac{C_{ir}}{C_{io}}}\\] \\[C_{i}=N_{i}N_{i-1}\\] 其中\\(D_{r}\\)为重训的ratio,\\(D_{0}\\)为原始的ratio,\\(N_{i}\\)为第\\(i\\)层的神经元个数。 重训参数 由于神经网络的连续层往往保持耦合性,因此重训模型时最好保持连接的权重,而不是重新初始化。并且卷积层和全连接层的剪枝是交替进行的,对fc进行剪枝重训时需要保持conv不变,反之对conv进行剪枝重训时需要保持fc不变。 迭代剪枝 迭代剪枝的方式可以最大程度的压缩模型大小。在不损失效果的前提下,相比单次剪枝,多次迭代的方式可以将AlexNet的压缩率从5X提高到9X。 裁剪神经元 每次剪枝可以将那些没有输入连接或没有输出连接的神经元移除。无输出的神经元对最终模型结果没有任何影响,因此移除也不会对模型效果产生影响,而那些没有输入连接的神经元由于梯度下降和正则化最终也会变成无输出的神经元。 实验结果 文中将裁剪门限设置为一个质量参数乘以这一层权重的标准差,并在LeNet、AlexNet和VGG-16上进行了相关实验,卷积层也可以跟全连接层一样使用相同的剪枝策略,重训模型时会有一次调整学习率的过程,比如LeNet重训时学习率会衰减到原来的1/10,AlexNet会衰减至原来的1/100。 AlexNet各层的压缩情况: 剪枝与其他模型压缩方法的对比: 模型保存 稀疏矩阵在保存时需要同时保存indices,比如按照CSR格式保存时,我们除了保存所有的非零元素外,还需要保存每个元素对应的列号以及每行第一个非零元素在所有元素中的位置。为了压缩保存indices带来的开销,文中提到使用相对indices代替绝对indices,全连接层可以使用5bit来表示相对indices,而卷积层也可以只使用8bit。 总结 由于卷积层本身就是稀疏连接,相比fc对剪枝更敏感,因此剪枝方法对于全连接层的压缩率更高。剪枝只能压缩模型大小,但inference时并不会带来预测速度提升。intel在16年提出另一个剪枝与嫁接相结合的方法Dynamic Network Surgery for Efficient DNNs,进一步提高了剪枝方法的压缩率和重训收敛速度,此外2017年孙剑等提出了针对卷积层的Channel Pruning方法,可以结合此处的剪枝方法,应该可以达到更好的压缩效果。 ##Channel Pruning for Accelerating Very Deep Neural Networks","categories":[{"name":"model compression","slug":"model-compression","permalink":"https://hjchen2.github.io/categories/model-compression/"}],"tags":[{"name":"pruning","slug":"pruning","permalink":"https://hjchen2.github.io/tags/pruning/"}]},{"title":"NEURAL MACHINE TRANSLATION论文学习串讲","slug":"seq2seq串讲","date":"2017-12-01T04:24:08.000Z","updated":"2023-02-07T02:38:51.695Z","comments":true,"path":"2017/12/01/seq2seq串讲/","link":"","permalink":"https://hjchen2.github.io/2017/12/01/seq2seq%E4%B8%B2%E8%AE%B2/","excerpt":"seq2seq 主要学习的是论文Neural machine translation by jointly learning to align and translate (Dzmitry Bahdanau、Yoshua Bengio等,2016.05)和Neural machine translation (Minh-ThangLuong,2016.12)。 神经机器翻译的目的是将一门语言的文本序列翻译成另一门语言的文本序列,因此机器翻译的训练语料一般是源语言和目标语言组成的一对文本,也叫做平行语料(parallel corpus)。我们通常将输入和输出都是序列的模型叫做seq2seq,seq2seq不仅应用在机器翻译领域,也用于当前热门的自动问答系统以及文本摘要的自动生成等领域。","text":"seq2seq 主要学习的是论文Neural machine translation by jointly learning to align and translate (Dzmitry Bahdanau、Yoshua Bengio等,2016.05)和Neural machine translation (Minh-ThangLuong,2016.12)。 神经机器翻译的目的是将一门语言的文本序列翻译成另一门语言的文本序列,因此机器翻译的训练语料一般是源语言和目标语言组成的一对文本,也叫做平行语料(parallel corpus)。我们通常将输入和输出都是序列的模型叫做seq2seq,seq2seq不仅应用在机器翻译领域,也用于当前热门的自动问答系统以及文本摘要的自动生成等领域。 Encoder-Decoder 2014年Dzmitry Bahdanau、Yoshua Bengio等人在论文Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation中首次提出将RNN Encoder-Decoder结构来计算双语短语对的条件概率,用于改进统计机器翻译的效果。Encoder-Decoder是由encoder和decoder两部分组成,encoder将输入序列编码成定长的语义向量,decoder将语义向量进行解码得到目标序列。 
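为了更直观地说明上面"encoder 把输入序列编码成定长语义向量 c、decoder 再逐词解码"的流程,下面给出一个用 numpy 随机参数实现的极简贪心解码示意(词表大小、向量维度和变量名都是本文为演示而假设的,不是论文的实现;其中 encode 对应取最后一个时刻隐状态作为 c,解码循环的每一步对应 p(y_t | y_<t, c),与下文的公式相对应):

import numpy as np

rng = np.random.RandomState(0)
V_src, V_tgt, E, H = 10, 12, 8, 16   # 假设的源/目标词表大小、词向量维度、隐层维度
EOS = 0                              # 假设目标词表中 0 为结束符(这里也兼作起始符,仅为简化)

Emb_src = rng.randn(V_src, E) * 0.1
Emb_tgt = rng.randn(V_tgt, E) * 0.1
W_e, U_e = rng.randn(H, E) * 0.1, rng.randn(H, H) * 0.1        # encoder RNN 参数
W_d, U_d, C_d = rng.randn(H, E) * 0.1, rng.randn(H, H) * 0.1, rng.randn(H, H) * 0.1  # decoder RNN 参数
W_o = rng.randn(V_tgt, H) * 0.1                                # 输出层参数

def encode(src_ids):
    h = np.zeros(H)
    for x in src_ids:                             # h_t = f(x_t, h_{t-1})
        h = np.tanh(W_e @ Emb_src[x] + U_e @ h)
    return h                                      # c 取最后一个时刻的隐状态

def greedy_decode(c, max_len=20):
    s, y, out = c.copy(), EOS, []                 # 用 c 初始化 decoder 隐状态
    for _ in range(max_len):
        s = np.tanh(W_d @ Emb_tgt[y] + U_d @ s + C_d @ c)   # s_t = f(s_{t-1}, y_{t-1}, c)
        p = np.exp(W_o @ s); p /= p.sum()                   # softmax 得到 p(y_t | y_{<t}, c)
        y = int(p.argmax())                                 # 贪心取概率最大的词
        if y == EOS:
            break
        out.append(y)
    return out

print(greedy_decode(encode([3, 5, 2])))

实际系统解码时一般会用文末提到的 beam search 代替这里的贪心搜索。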
在NMT中Encoder-Decoder试图直接对并行语料的条件概率\\(P(Y|X)\\)进行建模,encoder输入的是一组向量序列\\(X=(x_{1},…,x_{T_{x}})\\),\\(x_i\\)为词\\(i\\)的one-hot编码向量,并将序列\\(X\\)编码成语义向量\\(c\\),decoder输入语义向量\\(c\\),并逐个生成序列\\(Y=(y_{1},…,y_{T_{y}})\\),其中\\(y_{i}\\)的生成与之前已经生成的词序列\\(y_{1},…,y_{i-1}\\)有关。 \\[\\log p(Y|X)=\\sum_{t=1}^{T_{y}}\\log p(y_{t}|y_{<t}, c)\\] 对于不定长度序列的编码和解码,我们很自然会想到RNN,实际上RNN Encoder–Decoder就是正反两组RNN拼接在一起组成的编码解码网络。经典的RNN Encoder–Decoder示意图如下: 我们可以用下面公式描述编码过程: \\[h_{t}=f(x_{t},h_{t-1})\\] \\[c=q({h_{1},…,h_{T_{x}}})\\] 函数\\(f\\)一般用一个RNN结构来表示,可以是LSTM、GRU等,\\(h_{t}\\)表示encoder RNN在第t时刻的cell隐状态,向量c的计算与encoder RNN所有时刻的cell隐状态相关,函数\\(q\\)可以表示所有隐状态的加权和,但由于RNN的特殊性,我们这里只使用最后一个时刻的隐状态作为向量\\(c\\),即\\(c=h_{T_{x}}\\)。 对于解码过程,生成\\(y_{t}\\)时的条件概率可以改写成 \\[p(y_{t}|y_{<t},c)=g(y_{t-1},s_{t},c)\\] \\[s_{t}=f(s_{t-1},y_{t-1},c)\\] 其中,\\(g\\)是非线性函数,可以是单层的softmax,也可以是一个多层结构的神经网络,\\(y_{t-1}\\)表示上一时刻的输出,\\(f\\)同样是一个RNN结构,\\(s_{t}\\)表示decoder RNN cell的隐状态。 Attention 在Encoder-Decoder中每个目标词生成时使用的都是同一个向量\\(c\\),虽然理论上来讲向量\\(c\\)可以表示输入序列的语义信息,比如一些关键词、句子结构和语法信息等,但也存在注意力分散的问题。在机器翻译中,一般翻译出来的词与源序列的词是有对齐关系的,也就是说目标词的生成与源序列中的部分关键词关系更大,而其他词对当前目标词的生成影响就很小。在Encoder-Decoder中不论生成哪个目标词,使用的语义向量都是\\(c\\),而语义向量\\(c\\)是由句子\\(X\\)的每个单词经过Encoder编码而成的,也就意味着句子\\(X\\)中的关键词对生成任意目标词的影响力是相同的。 第一篇论文在Encoder-Decoder的基础上引入注意力机制,来解决上述注意力分散的问题。在论文中提出,每个目标词生成时使用的语义向量是不同的,也就是说Encoder-Decoder将会学会在生成目标词时给每个源语词分配权重,这个权重表示该源语词对当前目标词的重要程度。增加了attention机制的Encoder-Decoder框架如下图: 在基于attention的模型中,每个目标词生成时的条件概率可以写成: \\[p(y_{i}|y_{<t},X)=g(y_{i-1},s_{i},c_{i})\\] \\[s_{i}=f(s_{i-1},y_{i-1},c_{i})\\] 在RNN中每个时刻的隐状态\\(h_{i}\\)可以表示第\\(i\\)个源语词及其周围部分词的信息,因此与之前的Encoder-Decoder框架不同,语义向量\\(c_{i}\\)不再是encoder RNN最后一个时刻的隐状态,而是与encoder RNN所有时刻的隐状态(\\(h_{1},...,h_{T_{x}}\\))相关的一个向量。 \\[c_{i}=\\sum_{j=1}^{T_{x}}\\alpha_{ij}h_{j}\\] \\(\\alpha_{ij}\\)可以认为是目标词\\(i\\)与源语词\\(j\\)的对齐权重,因此可以使用源语词\\(i\\)的隐状态\\(h_{i}\\)和目标词前一时刻的隐状态\\(s_{i-1}\\)来计算。 \\[\\alpha_{ij}=\\frac{\\exp(e_{ij})}{\\sum_{k=1}^{T_{x}}\\exp(e_{ik})}\\] 其中 \\[e_{ij}=a(s_{i-1},h_{j})\\] \\(a\\)是一个对齐模型,在Bahdanau的论文中将其定义成一个前馈神经网络,与Encoder-Decoder一起参与训练。计算公式如下: \\[a(s_{i-1},h_{j})=v_{a}^\\mathsf{T}\\cdot tanh(W_{a}s_{i-1}+U_{a}h_{j}) \\] \\(v_{a}\\)、\\(W_{a}\\)和\\(U_{a}\\)都是对齐模型的参数。在第二篇ThangLuong的论文中提出下面三种计算方式,本质上也是大同小异。 下图是Bahdanau在论文中给出的一个模拟图,图中模拟的是在给定源语序列(\\(X_{1},X_{2},...,X_{T}\\))的情况下生成第\\(t\\)个目标词\\(y_{t}\\)的过程。 Encoder 在Bahdanau的论文中Encoder和Decoder使用的都是GRU(Gated Recurrent Unit),GRU与LSTM一样都是RNN众多变体中比较常见的一种,也可以使用其他变体RNN,比如在ThangLuong的论文中主要用的就是LSTM。 我们知道传统的RNN理论上可以记忆无限长的序列,但由于递归权重对每个时刻的输入都是一样的,这就导致一个二选一的问题:(1) 模型发散,无法收敛(2)梯度消失,无法产生长时记忆。GRU和LSTM一样,都是通过引入门(gate)的机制来解决传统RNN梯度消失的问题,gate打开和关闭是由当前时刻的输入和前一时刻的隐层状态控制的,也就是说每个时刻gate的状态都是不同的,一些需要长时间记忆的信息会通过gate一直传递下去,从而学习到长距离依赖。 传统RNN的隐层计算公式:\\(h_{t}=g(W^{hh}h_{t-1}+W^{hx}x_{t})\\),\\(W^{hh}\\)是递归权重,\\(W^{hx}\\)是隐层的权重。实际上,LSTM和GRU都可以认为是对\\(h_{t}\\)计算方式的改进。 下面是GRU结构的示意图,输入为\\(h_{t-1}\\)和\\(x_{t}\\),输出为\\(h_{t}\\)。在GRU中存在两个gate,一个是reset gate,一个是update gate,分别对应下图中的\\(r_{t}\\)和\\(z_{t}\\),\\(\\widetilde h_{t}\\)表示候选隐层状态,候选隐层状态与上一时刻的隐层状态\\(h_{t-1}\\)一起更新当前时刻的隐层状态\\(h_{t}\\)。 GRU的计算过程: 1、首先计算重置门\\(r_{t}\\)和更新门\\(z_{t}\\),其中\\(\\sigma\\)表示sigmoid函数 \\[r_{t}=\\sigma(W^{r}x_{t}+U^{r}h_{t-1})\\] \\[z_{t}=\\sigma(W^{z}x_{t}+U^{z}h_{t-1})\\] 2、计算候选隐层状态\\(\\widetilde h_{t}\\),其中\\(r_{t}\\)用来控制历史记忆的传递,如果\\(r_{t}=0\\),那么\\(\\widetilde h_{t}\\)只与当前输入\\(x_{t}\\)有关,历史记忆被重置。 \\[\\widetilde h_{t}=tanh(Wx_{t}+U[r_{t}\\odot h_{t-1}])\\] 实际上仅仅增加一个reset gate就已经可以解决长时依赖的问题,因为如果有需要\\(r_{t}\\)可以总等于1,那么历史记忆就会一直传递下去。但这会带来一个问题,\\(h_{t-1}\\)会累加到当前时刻的隐层状态上产生新的记忆,不断累加的记忆会导致\\(\\widetilde 
h_{t}\\)达到饱和,最终导致模型无法收敛。为了解决这个问题,GRU可以选择对当前输入产生的新记忆进行遗忘,只传递之前的历史记忆,也就是说我们允许GRU舍弃一些对后续无关的输入信息,保证记忆都是有效信息。GRU是通过下面的更新操作来实现这个过程的, \\[h_{t}=z_{t}\\odot h_{t-1}+(1-z_{t})\\odot \\widetilde h_{t}\\] \\(z_{i}\\)反映了相对历史记忆当前输入信息的重要程度,\\(z_{i}\\)越小表明当前输入信息越重要。 实际上在Bahdanau的论文中使用的是双向RNN(BiRNN),BiRNN在前向RNN的基础上增加了一个反向RNN,使得RNN可以同时看到历史和未来的信息,最终前向RNN的隐层状态和反向RNN的隐层状态拼接后输出。 \\[h_{i}=\\left [ \\begin{align} & \\vec{h_{i}} \\\\ & \\stackrel{\\leftarrow}{h_{i}} \\end{align}\\right ]\\] Decoder 在Bahdanau的论文中decoder采用是一个前向的GRU,但与encoder GRU不同的是decoder GRU需要额外输入语义向量\\(c_{i}\\)。decoder GRU隐层状态\\(s_{i}\\)的计算如下: \\[s_{i}=(1-z_{i})\\odot s_{i-1}+z_{i}\\odot \\widetilde s_{i}\\] 其中, \\[\\widetilde s_{i}=tanh(Wy_{i-1}+U[r_{i}\\odot s_{i-1}]+Cc_{i})\\] \\[r_{i}=\\sigma(W_{r}y_{i-1}+U_{r}s_{i-1}+C_{r}c_{i})\\] \\[z_{i}=\\sigma(W_{z}y_{i-1}+U_{z}s_{i-1}+C_{z}c_{i})\\] encoder GRU的隐层状态会被传递到decoder GRU用于生成第一个目标词,所以decoder GRU的隐层状态的初始值不是0,而是将encoder中反向GRU第一个时刻的隐层状态直接复制给decoder GRU,即\\(s_{0}=tanh(W_{s}\\stackrel{\\leftarrow}{h_{1}})\\)。 beam search","categories":[{"name":"neural machine translation","slug":"neural-machine-translation","permalink":"https://hjchen2.github.io/categories/neural-machine-translation/"}],"tags":[{"name":"seq2seq","slug":"seq2seq","permalink":"https://hjchen2.github.io/tags/seq2seq/"},{"name":"machine translation","slug":"machine-translation","permalink":"https://hjchen2.github.io/tags/machine-translation/"},{"name":"Encoder-Decoder","slug":"Encoder-Decoder","permalink":"https://hjchen2.github.io/tags/Encoder-Decoder/"},{"name":"Attention","slug":"Attention","permalink":"https://hjchen2.github.io/tags/Attention/"}]},{"title":"阿里KunPeng框架学习","slug":"KunPeng论文阅读","date":"2017-08-22T04:53:08.000Z","updated":"2023-02-07T02:49:01.177Z","comments":true,"path":"2017/08/22/KunPeng论文阅读/","link":"","permalink":"https://hjchen2.github.io/2017/08/22/KunPeng%E8%AE%BA%E6%96%87%E9%98%85%E8%AF%BB/","excerpt":"KunPeng是阿里最新公布的一个大规模机器学习框架,不仅包括了数据/模型并行、负载均衡、模型同步、稀疏表达、工业级容错等特性,而且还提供了易于使用的接口,在很多机器学习算法上都带来了非常大的性能提升。 原始论文 KunPeng: Parameter Server based Distributed Learning Systems and Its Applications in Alibaba and Ant Financial。","text":"KunPeng是阿里最新公布的一个大规模机器学习框架,不仅包括了数据/模型并行、负载均衡、模型同步、稀疏表达、工业级容错等特性,而且还提供了易于使用的接口,在很多机器学习算法上都带来了非常大的性能提升。 原始论文 KunPeng: Parameter Server based Distributed Learning Systems and Its Applications in Alibaba and Ant Financial。 Introduction 主要对一些通用分布式计算框架进行比较。 Hadoop:只提供了一些粗粒度的操作,比如Map、Reduce和Join等。很多限制导致基于Hadoop的机器学习算法效率都非常低,这些限制包括中间结果会落盘、只能在shuffling阶段进行数据交换等。 Spark:使用RDD弥补了Hadoop的一些缺点,提供MLlib库,MLlib整合了很多机器学习算法,并且非常容易使用。但MLlib只支持中等规模的特征,计算和通信效率都比较低。一些公司使用第三方组件来弥补Spark的缺陷,但至今没有一个完美的方案。 GraphLab和GraphX:基于图的并行计算框架,允许用户进行细粒度的控制,但并不适合通用的机器学习算法,比如LR、深度学习等,并且也存在效率低的问题。 MPI:接口灵活高效,代码自由度比较高,比如在代码中所有进程之间可以随时通信。但使用MPI开发一个新算法的开销非常大,比如一个复杂的异步矩阵分解算法需要2000多行代码。MPI没有提供分布式ML平台通用的组件,比如分布式数据读取,内存管理和多线程并行的组件。更重要的是MPI没有提供单点失败的本地解决方案,根据他们的统计数据显示MPI作业在节点数越多时失败率越高。 parameter server框架:包含无状态的workers和有状态的servers,workers负责大部分的计算任务,servers负责保存和更新模型参数。servers可以定期将模型参数快照保存到一个缓存位置,一旦有节点失败,parameter server会自动从最新的checkpoint中恢复模型参数。parameter server框架只支持pserver和worker之间通信, 而pserver和pserver、worker和worker之间无法进行点对点通信,并且由于细粒度的接口导致用户编程比较复杂,因此现有的parameter server框架还存在几个问题:一是通信接口比较单一,没有MPI灵活;二是对于用户来说没有Spark易于编程使用。 正是由于上述框架的种种缺点,他们开发了一个产品级的分布式学习系统—KunPeng。KunPeng结合了parameter server和MPI的优点,提供鲁棒的failover机制,高效的稀疏数据通信接口和与MPI类似的通用接口,并且提供一个C++和Python的SDK,该SDK提供了一个类似单机的开发环境。KunPeng也与阿里的Apsara平台深度对接,提供ML的全工具集,包括基于SQL和MapReduce的数据预处理、预测、评估等等。 KunPeng整体架构 Apsara Cloud Platform 
Apsara是阿里开发的一个大规模分布式操作系统,目前已运行在跨数十个机房的十几万台服务器上。下图中天蓝色部分就是Apsara的模块,白色部分为运行在Apsara之上的各种云服务,KunPeng就属于图中白色部分,运行在Apsara上,由Apsara提供任务调度和监控、文件系统等服务。 图中红色边框的任务调度模块和资源管理模块被统称为Fuxi(伏羲),Fuxi支持多种特性以保证系统的可扩展性和容错性,这些特性包括:增量资源管理协议、用户透明的失败恢复、故障点自动检测和多级黑名单机制。 KunPeng Platform KunPeng分为ML-Bridge和PS-Core两个子系统,ML-Bridge是KunPeng提供的高级编程模型,用户通过脚本编程的workflow可以方便地实现数据预处理、训练、预测和评估等算法,PS-Core是一个分布式键值对存储的paramter server框架。 ML-Bridge由三个组件构成: 解释器。将用户的脚本解释为系统支持的算法 优化器。根据运行状态的历史统计和启发式方法,分析、调试和优化作业配置 执行器。根据作业的配置生成Fuxi调度的配置,提供整个作业生命周期的监控,并提供用户监控UI ML-Bridge简化了用户编程,比如一个算法流程包括数据入库与预处理、训练、评估和AB测试几个流程,在KunPeng中只需要调用下图中的几行命令就可以实现。整个流程对用户来说都是透明的,用户也不需要关心算法的具体实现和作业调度过程。 ede2df215585fc86358bc9868565d1ce PS-Core不仅支持数据并行和模型并行,同时还支持模型同步更新(BSP)、ASP和SSP,稀疏表达和容错机制。 PS-Core在传统的worker和server基础上,增加了一个用于迭代控制的coordinator。coordinator声明了数据计算和参数更新的操作,构建了整个ML workerflows的作业图,并将这些作业调度到worker和server上运行,并参与servers和workers的failover过程。coordinator在迭代结束时会与Apsara的meta对迭代状态进行同步,并且由Fuxi监控管理,因此不存在SPOF(单点失败)的问题。 容错方案 KunPeng也给出了servers和workers的容错解决方案。对于servers,它们会异步地将参数快照保存到分布式文件系统,并且它们会在内存中对参数进行两备份,支持hot failover加速恢复过程。大多数情况下(比如接收到coordinator的恢复请求),servers可以立刻通过内存备份的参数中恢复。即使是servers或整个任务被中断或被kill,servers也可以通过最近一次保存的参数进行恢复训练。对于stateless的workers,failover非常简单,只需要从servers上pull对应的参数。对于stateful的workers,同样提供保存快照的接口,因此对于一些workers有本地状态的算法(比如LDA),faliover也非常简单。 总的来说,KunPeng的failover过程是当Fuxi检测到有节点失败时,重新调度新的节点,同时给coordinator发送异步节点失败的消息,coordinator接收消息后给servers和workers发送恢复请求,对于正常的servers接收请求后会直接从内存中恢复,而对于新调度的servers会从checkpoint中恢复,对于workers需要先从servers上pull对应的参数,stateful的workers还需要从保存的checkpoint中恢复状态。 DAG调度 这里的调度指的是coordinator对servers和workers的调度。由于coordinator节点会根据算法的workflow构建对应的作业DAG,并将DAG调度到servers和workers上进行执行。为了提高机器资源利用率和作业效率,DAG中相同深度的节点可以并行执行,比如下图中的Calculate for Block 0节点和Load Data for Block 1节点。通过DAG接口用户可以自定义IO操作、计算和通信过程,可以很方便地实现各种模型更新算法。 e76cf7c13015b83ed7696b5fa7c8dac0 下图表示了PS-Core中bounded delay ASGD算法的C++实现,用户可以重写下面的Iterate函数实现自定义的算法。图中的mServerParam和mServerGrad对应servers上的模型参数和梯度,mWorkerParam和mWorkerGrad对应workers本地的模型参数和梯度,mSubDatasetPtr对应当前worker的数据子集。nSync为最大延迟迭代次数,nPull和nPush分别为从servers获取最新参数和将梯度发送给servers的频率。通过设置nSync、nPull和nPush可以很方便地在BSP和SSP之间切换,而去除SyncBarrier就成了ASP算法的实现。 69ed0d3573fbebf558494bc4a9a14c74 负载均衡和通信接口 由于集群中机器的底层硬件和运行状态存在差异,因此一个任务的执行效率很大程度上取决于运行最慢的那个机器,针对这种情况可以有多种负载均衡的方法,比如可以对负载较高的机器分配更少的数据和计算量,PS-Core也为此设计了一个Backup instance机制。当某个节点被确定为慢节点时,coordinator会把慢节点标记为\"dead\"节点,请求Fuxi重新调度一个新的节点作为该节点的备份节点,并将该节点的负载转移到备份节点上。这种机制通常可以带来10%-20%的效率提升。 KunPeng对不同稀疏度和不同数据类型的数据通信做了深度优化,并且提供workers之间点对点的通信接口,比如AllReduce,ReduceTo和Bcast,这些灵活的通信接口使得KunPeng可以拓展更多的功能,比如模型并行。 FTRL \\[w_{t+1}=\\mathop{\\arg\\min}_{w}\\left(\\sum_{s=1}^{t}g_{s}w+\\frac{1}{2}\\sum_{s=1}^{t}\\delta_{s}{\\Vert}w-w_{s}{\\Vert}_{2}^{2}+\\lambda_{1}{\\Vert}w{\\Vert}_{1}+\\lambda_{2}{\\Vert}w{\\Vert}_{2}^{2}\\right)\\] 其中\\(g\\)为损失函数对\\(w\\)的梯度,\\(\\delta_{t}=\\frac{1}{\\eta_{t}}-\\frac{1}{\\eta_{t-1}}\\),因此\\(\\sum_{s=1}^{t}{\\delta_{s}}=\\frac{1}{\\eta_{t}}\\),\\(\\eta\\)为学习率,并且\\(\\eta_{t,i}=\\frac{\\alpha}{\\beta+\\sqrt{\\sum_{s=1}^{s}{g_{s,i}^2}}}\\),通常\\(\\alpha=1\\),\\(\\beta\\)是与数据集和特征相关的超参数。\\(\\lambda_{1}\\)为L1系数,\\(\\lambda_{2}\\)为L2系数。 更新公式为 \\[w_{t+1}=\\begin{cases}0& if\\ {\\vert}z_{i}{\\vert}{\\leq}\\lambda_{1}\\\\ -(\\frac{\\beta+\\sqrt{n_{i}}}{\\alpha}+\\lambda_{2})^{-1}(z_{i}-sign(z_{i})\\lambda_{1})& otherwise\\end{cases}\\] 下图表明了LR FTRL-Proximal算法单机更新过程。 66cf72a181547ae24831af8500b47d72 这个算法在单机时很容易实现,但在分布式环境必须要考虑通信效率、servers的负载和算法收敛性问题。考虑到BSP的低效和ASP可能不收敛的问题,他们使用了bounded delay的SSP更新方法,并且设置trust region来调节参数范围,避免模型发散。整个算法具体过程如下: 
workers本地保存了模型\\(w\\)和\\(z\\)、\\(n\\),\\(z\\)、\\(n\\)通过bounded-asynchronous的方式与servers保持同步 workers加载数据,根据\\(z\\)和\\(n\\)更新本地模型\\(w\\),计算梯度并更新本地模型\\(w\\)和\\(z\\)、\\(n\\),同时使用\\(\\delta_{z}\\)和\\(\\delta_{n}\\)累加\\(z\\)和\\(n\\)的增量,在需要与servers同步的时候将累加的\\(\\delta_{z}\\)和\\(\\delta_{n}\\) push到servers servers合并所有workers发送的\\(\\delta_{z}\\)和\\(\\delta_{n}\\),最后更新全局\\(z\\)和\\(n\\)。 workers向servers传递\\(z\\)和\\(n\\)的增量,而不是直接传递模型梯度,这种做法虽然会带来一些通信开销,但降低了servers的计算负载,这是在通信效率和计算负载之间做的平衡。为了避免发散,servers在trust region下更新模型。trust region的策略有两种:一种是当模型中的元素超出置信阈时,直接回退整个模型;另一种是通过映射的方式将模型的值限制在置信阈中。 0de2241d38a792bb79446944d65d8c66 MART MART(多增量回归树)又叫做GBDT,是一种应用比较广泛的机器学习算法。KunPeng实现了一个通用的MART算法,支持千亿级样本量和上千维的特征,并在MART的基础上实现了LambdaMART算法。 MART 为了处理超大规模的数据量,KunPeng-MART使用数据并行的方式减少内存使用量,并采用了XGBoost的分布式加权直方图算法优化分裂点查找过程。具体来说就是,每个worker都保存了整颗树,在分割叶节点时, (1)每个worker使用分配的数据子集计算一个局部加权直方图,计算完成后将直方图push到servers (2)servers收到workers发送的直方图后,采用多路合并算法得到全局直方图,并找到最优分割点 (3)workers从servers pull分割点,分裂节点并将数据分到分裂后的叶节点 重复上述过程,可以得到整棵树。然后只要按照gradient boosting方法一棵一棵地建树,最终得到MART。随着特征维度和树深度的增加,查找分裂点过程中的计算和通信都可能成为性能瓶颈。为了解决这个问题,他们提到使用KunPeng的通信模式去减少合并局部直方图的开销,但并没有透露具体的方法。 LambdaMART LambdaMART建树的过程与上面的MART一样,不同的是LambdaMART计算一阶导数和二阶导数的方式。由于LambdaMART要求同一个query group的训练数据按sample两两组成pair对,因此当训练数据不是按照query group连续存储时就会存在问题。对于这个问题,他们提出了两种解决方法: (1)先全局统计一下每个query id对应的样本总数,然后按照multiway number partitioning algorithm对query id进行分片,每个worker只加载属于自己的query ids对应的训练样本。 (2)第二种是近似的方法。首先要求相同query id的样本在文件系统中是连续存储的,然后每个worker还是按照正常情况加载属于自己的分片数据。如果相同query id的样本被分在两个不同的worker上,则会把这两个worker上相同query id的样本当做不同query id来处理。 其他算法 Large-scale sparse Logistic Regression (LR) 实现了不同的优化算法,L-BFGS、OWL-QN和BCD,其中BCD算法是数据和模型同时并行的算法。 Distributed Factorization Machines workers异步计算梯度,使用AdaGrad优化算法 Caffe 实现了Caffe和KunPeng的对接,a generalized CPU-based large-scale deep learning platform,简化DL算法开发 实验结果 下面的实验都是在一个拥有5000台服务器的正式集群上进行的,每台机器12个Intel Xeon CPU E5-2430 (2.2 GHz) CPU和96GB内存。 KunPeng、Spark和MPI的LR算法对比 143e082b7f1a6b54e47e9c8b51026dbb 不同平台的LR都采用L-BFGS算法更新,并且memory history parameter都设置为10,并且使用同一个集群相同的CPU资源,在7个不同的数据集上KunPeng在效率和内存占用上都取得非常明显的优势。 在另外一个18 billion样本和 7 billion特征的数据集上,他们统计了KunPeng在不同workers数量时的加速比。 00c84f368394ba04d59dbe530f69c387 KunPeng仅使用25个workers就可以训练这么大的数据,workers增加时依然能保持较高的加速比,并且内存占用随着workers增加而近乎直线降低。 KunPeng-MART和XGBoost的对比 下图分别为KunPeng-MAR和XGBoost在不同任务上的峰值内存占用和训练时间对比。 1b0888cab293242eaccdc2b6e5bf25d9 3b99dc82bc268d3da394a688c0234908 KunPeng-FM、LibFM和DiFacto的对比 下面是在单机情况下的训练效果对比,并没有训练时间的对比数据和多机实验相关的数据。 da511a1bb0db987fb74ebb08fa5352c9 参考资料 1、Ad Click Prediction: a View from the Trenches.","categories":[{"name":"ML framework","slug":"ML-framework","permalink":"https://hjchen2.github.io/categories/ML-framework/"}],"tags":[{"name":"large scale ML framework","slug":"large-scale-ML-framework","permalink":"https://hjchen2.github.io/tags/large-scale-ML-framework/"},{"name":"KunPeng","slug":"KunPeng","permalink":"https://hjchen2.github.io/tags/KunPeng/"}]},{"title":"C++调用python","slug":"C++调用Python接口","date":"2017-07-03T04:31:08.000Z","updated":"2023-01-03T14:04:13.435Z","comments":true,"path":"2017/07/03/C++调用Python接口/","link":"","permalink":"https://hjchen2.github.io/2017/07/03/C++%E8%B0%83%E7%94%A8Python%E6%8E%A5%E5%8F%A3/","excerpt":"由于需要在组内新开发的一套机器学习框架上开发一个强化学习的demo,但目前开源的一些游戏环境都只提供了python接口,比如Gym。如果要使用Gym去做在线训练的话,就需要在C++代码中调用Python接口,因此找了些例子学习了一下如何使用Python C API。当然Python C API不是唯一的方式,也可以使用boost的Python模块,有时间再研究。","text":"由于需要在组内新开发的一套机器学习框架上开发一个强化学习的demo,但目前开源的一些游戏环境都只提供了python接口,比如Gym。如果要使用Gym去做在线训练的话,就需要在C++代码中调用Python接口,因此找了些例子学习了一下如何使用Python C API。当然Python C 
API不是唯一的方式,也可以使用boost的Python模块,有时间再研究。 hello python 1234567891011#include <stdio.h>#include <iostream>#include "python/Python.h"int main() { Py_Initialize(); std::cout << "hello c++!" << std::endl; PyRun_SimpleString("print 'hello python!'"); Py_Finalize(); return 0;} 编译: 1g++ test.cpp -o test -lpython 执行:./test 12hello c++!hello python! 调用python脚本中的函数 123# test_add.pydef add(a, b): return a+b 123456789101112131415161718192021222324252627282930313233343536373839404142434445#include <stdio.h>#include <iostream>#include "python/Python.h"int main(int argc, char* argv[]) { if (argc < 3) { std::cerr << "Usage: ./exe integer1 integer2" << std::endl; return 1; } std::cerr << "hello c++!" << std::endl; Py_Initialize(); PyRun_SimpleString("import sys"); PyRun_SimpleString("sys.path.append('./')"); PyRun_SimpleString("print 'hello python!'"); PyObject* moduleName = PyString_FromString("test_add"); PyObject* pModule = PyImport_Import(moduleName); if (!pModule) { std::cerr << "[ERROR] Python get module failed." << std::endl; return 1; } PyObject* pv = PyObject_GetAttrString(pModule, "add"); if (!pv || !PyCallable_Check(pv)) { std::cerr << "[ERROR] Can't find function (add)" << std::endl; return 1; } PyObject* args = PyTuple_New(2); PyObject* arg1 = PyInt_FromLong(atoi(argv[1])); PyObject* arg2 = PyInt_FromLong(atoi(argv[2])); PyTuple_SetItem(args, 0, arg1); PyTuple_SetItem(args, 1, arg2); PyObject* pRet = PyObject_CallObject(pv, args); if (!pRet) { std::cerr << "[ERROR] Call funftion (add) failed" << std::endl; return 1; } long result = PyInt_AsLong(pRet); std::cout << "result: " << result << std::endl; Py_Finalize(); return 0;} 编译: 1g++ test.cpp -o test -lpython 执行:./test 3 4 123hello c++!hello python!result: 7 Q学习的一个例子 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172# tree.py"""author: Houjiang Chen"""import randomclass q_learning: def __init__(self, states, actions): self.states = states self.actions = actions self.eps = 0.1 self.alpha = 0.1 self.q_table = [[0 for j in range(actions)] for i in range(states)] def get_action(self, current_state): max_action = self.q_table[current_state].index(max(self.q_table[current_state])) if random.uniform(0, 1) > self.eps: return max_action else: rest = [i for i in range(len(self.q_table[current_state])) if i != max_action] index = random.randint(0, len(rest) - 1) return rest[index] def update(self, current_state, action, next_state, reward, final): if not final: reward = reward + max(self.q_table[next_state]) self.q_table[current_state][action] += self.alpha * (reward - self.q_table[current_state][action])class environment: def __init__(self): self.level = 2 self.actions = 2 self.states = self.actions ** (self.level + 1) - 1 self.final_states = self.actions ** self.level self.reward = {0 : [10, -10], 1 : [50, 100], 2 : [100, 150]} def next(self, current_state, action): """action: 0 or 1 return: next_state reward, is_final """ next = 2 * current_state + (action + 1) if next >= self.states - self.final_states: return None, self.reward[current_state][action], True else: return next, self.reward[current_state][action], False def reset(self): return random.randint(0, self.states - self.final_states - 1)def main(): env = environment() agent = q_learning(env.states, env.actions) episode = 0 while episode < 10000: episode += 1 print "episode: %d" % episode current_state = env.reset() while True: action = agent.get_action(current_state) next_state, reward, final = 
env.next(current_state, action) agent.update(current_state, action, next_state, reward, final) if final: break current_state = next_state print agent.q_tableif __name__ == '__main__': main() 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120#include <stdio.h>#include <iostream>#include "python2.7/Python.h"PyObject* New_PyInstance(PyObject* cls, PyObject* args) { PyObject* pInstance = PyInstance_New(cls, args, NULL); if (!pInstance) { std::cerr << "new instance failed" << std::endl; exit(1); } return pInstance;}int main(int argc, char* argv[]) { Py_Initialize(); PyRun_SimpleString("import sys"); PyRun_SimpleString("sys.path.append('./')"); PyObject* moduleName = PyString_FromString("tree"); PyObject* pModule = PyImport_Import(moduleName); if (!pModule) { std::cerr << "[ERROR] Python get module failed." << std::endl; return 1; } PyObject* pEnv = PyObject_GetAttrString(pModule, "environment"); if (!pEnv) { std::cerr << "[ERROR] Can't find class (environment)" << std::endl; return 1; } PyObject* pEnvObject = New_PyInstance(pEnv, NULL); PyObject* pEnvLevel = PyObject_GetAttrString(pEnvObject, "level"); if (!pEnvLevel) { std::cerr << "[ERROR] Env has no attr level" << std::endl; return 1; } PyObject* pEnvActions = PyObject_GetAttrString(pEnvObject, "actions"); PyObject* pEnvStates = PyObject_GetAttrString(pEnvObject, "states"); PyObject* pEnvFinalState = PyObject_GetAttrString(pEnvObject, "final_states"); int level = PyInt_AsLong(pEnvLevel); int actions = PyInt_AsLong(pEnvActions); int states = PyInt_AsLong(pEnvStates); int final_state = PyInt_AsLong(pEnvFinalState); std::cout << "env level: " << level << std::endl; std::cout << "env actions: " << actions << std::endl; std::cout << "env states: " << states << std::endl; std::cout << "env final_state: " << final_state << std::endl; PyObject* pLearn = PyObject_GetAttrString(pModule, "q_learning"); PyObject* pLearnArgs = Py_BuildValue("ii", states, actions); PyObject* pLearnObject = New_PyInstance(pLearn, pLearnArgs); PyObject* pLearnStates = PyObject_GetAttrString(pLearnObject, "states"); PyObject* pLearnActions = PyObject_GetAttrString(pLearnObject, "actions"); PyObject* pLearnEps = PyObject_GetAttrString(pLearnObject, "eps"); int learn_states = PyInt_AsLong(pLearnStates); int learn_actions = PyInt_AsLong(pLearnActions); float learn_eps = PyFloat_AsDouble(pLearnEps); std::cout << "learn_states: " << learn_states << std::endl; std::cout << "learn_actions: " << learn_actions << std::endl; std::cout << "learn_eps: " << learn_eps << std::endl; PyObject* pEnvResetFunc = PyObject_GetAttrString(pEnvObject, "reset"); PyObject* pEnvNextFunc = PyObject_GetAttrString(pEnvObject, "next"); PyObject* pLearnGetActionFunc = PyObject_GetAttrString(pLearnObject, "get_action"); PyObject* pLearnUpdateFunc = PyObject_GetAttrString(pLearnObject, "update"); if (!pEnvNextFunc) { std::cerr << "[ERROR] env has no function named next" << std::endl; return 1; } std::cout << std::endl; uint64_t episode = 0; for (episode = 0; episode < 10000; ++episode) { if (episode % 100 == 0) std::cout << "episode: " << episode << std::endl; PyObject* current_state = PyEval_CallObject(pEnvResetFunc, NULL); while (true) { PyObject* args1 = PyTuple_New(1); PyObject* args2 = PyTuple_New(2); PyTuple_SetItem(args1, 0, current_state); PyObject* action = PyEval_CallObject(pLearnGetActionFunc, 
args1); PyTuple_SetItem(args2, 0, current_state); PyTuple_SetItem(args2, 1, action); PyObject* ret = PyEval_CallObject(pEnvNextFunc, args2); PyObject* next_state = PyTuple_GetItem(ret, 0); PyObject* final = PyTuple_GetItem(ret ,2); PyObject* args3 = PyTuple_New(5); PyTuple_SetItem(args3, 0, current_state); PyTuple_SetItem(args3, 1, action); PyTuple_SetItem(args3, 2, next_state); PyTuple_SetItem(args3, 3, PyTuple_GetItem(ret, 1)); PyTuple_SetItem(args3, 4, final); PyEval_CallObject(pLearnUpdateFunc, args3); if (PyObject_IsTrue(final)) { break; } current_state = next_state; if (args3) Py_DECREF(args3); } } PyObject* pLearnQTable = PyObject_GetAttrString(pLearnObject, "q_table"); for (int i = 0; i < PyList_Size(pLearnQTable); ++i) { std::cout << "state " << i << std::endl; PyObject* term = PyList_GetItem(pLearnQTable, i); if (PyList_Check(term)) { for (int j = 0; j < PyList_Size(term); ++j) { std::cout << " direct: " << j << ", " << "Qvalue: " << PyFloat_AsDouble(PyList_GetItem(term, j)) << std::endl; } } } Py_Finalize(); return 0;} 编译: 1g++ test.cpp -o test -I../python2.7.12/include -L../python2.7.12/lib -lpython2.7 执行:./test 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129env level: 2env actions: 2env states: 7env final_state: 4learn_states: 7learn_actions: 2learn_eps: 0.1episode: 0episode: 100episode: 200episode: 300episode: 400episode: 500episode: 600episode: 700episode: 800episode: 900episode: 1000episode: 1100episode: 1200episode: 1300episode: 1400episode: 1500episode: 1600episode: 1700episode: 1800episode: 1900episode: 2000episode: 2100episode: 2200episode: 2300episode: 2400episode: 2500episode: 2600episode: 2700episode: 2800episode: 2900episode: 3000episode: 3100episode: 3200episode: 3300episode: 3400episode: 3500episode: 3600episode: 3700episode: 3800episode: 3900episode: 4000episode: 4100episode: 4200episode: 4300episode: 4400episode: 4500episode: 4600episode: 4700episode: 4800episode: 4900episode: 5000episode: 5100episode: 5200episode: 5300episode: 5400episode: 5500episode: 5600episode: 5700episode: 5800episode: 5900episode: 6000episode: 6100episode: 6200episode: 6300episode: 6400episode: 6500episode: 6600episode: 6700episode: 6800episode: 6900episode: 7000episode: 7100episode: 7200episode: 7300episode: 7400episode: 7500episode: 7600episode: 7700episode: 7800episode: 7900episode: 8000episode: 8100episode: 8200episode: 8300episode: 8400episode: 8500episode: 8600episode: 8700episode: 8800episode: 8900episode: 9000episode: 9100episode: 9200episode: 9300episode: 9400episode: 9500episode: 9600episode: 9700episode: 9800episode: 9900state 0 direct: 0, Qvalue: 110 direct: 1, Qvalue: 140state 1 direct: 0, Qvalue: 50 direct: 1, Qvalue: 100state 2 direct: 0, Qvalue: 100 direct: 1, Qvalue: 150state 3 direct: 0, Qvalue: 0 direct: 1, Qvalue: 0state 4 direct: 0, Qvalue: 0 direct: 1, Qvalue: 0state 5 direct: 0, Qvalue: 0 direct: 1, Qvalue: 0state 6 direct: 0, Qvalue: 0 direct: 1, Qvalue: 0 参考资料 Python/C API Reference Manual: 
https://docs.python.org/2/c-api/index.html","categories":[{"name":"code","slug":"code","permalink":"https://hjchen2.github.io/categories/code/"}],"tags":[{"name":"c++","slug":"c","permalink":"https://hjchen2.github.io/tags/c/"},{"name":"python","slug":"python","permalink":"https://hjchen2.github.io/tags/python/"},{"name":"embedding","slug":"embedding","permalink":"https://hjchen2.github.io/tags/embedding/"}]},{"title":"多节点异步更新中momentum的影响","slug":"ASGD中momentum的影响","date":"2017-06-21T04:31:08.000Z","updated":"2023-01-03T14:04:03.846Z","comments":true,"path":"2017/06/21/ASGD中momentum的影响/","link":"","permalink":"https://hjchen2.github.io/2017/06/21/ASGD%E4%B8%ADmomentum%E7%9A%84%E5%BD%B1%E5%93%8D/","excerpt":"这几天的主要工作是将caffe移植到组内新开发的某个计算框架,在验证正确性时遇到一个问题。由于计算框架只支持异步更新的方式,因此采用全异步SGD算法训练Alexnet时非常容易发散。另外调研了一下近期发布的异步更新算法DC-ASGD,实验结果只能说对收敛有些正向效果,仍无法解决训练发散的问题。在另外一个DNN的网络上发现在多机时momentum对收敛结果有较大影响,momentum会导致收敛出现较大波动。","text":"这几天的主要工作是将caffe移植到组内新开发的某个计算框架,在验证正确性时遇到一个问题。由于计算框架只支持异步更新的方式,因此采用全异步SGD算法训练Alexnet时非常容易发散。另外调研了一下近期发布的异步更新算法DC-ASGD,实验结果只能说对收敛有些正向效果,仍无法解决训练发散的问题。在另外一个DNN的网络上发现在多机时momentum对收敛结果有较大影响,momentum会导致收敛出现较大波动。 网上找了一圈,似乎也就这个有些参考价值: http://stanford.edu/~imit/tuneyourmomentum/theory/ 看来近期得做一些调momentum和学习率的实验了。。。","categories":[{"name":"deep learning","slug":"deep-learning","permalink":"https://hjchen2.github.io/categories/deep-learning/"}],"tags":[{"name":"caffe","slug":"caffe","permalink":"https://hjchen2.github.io/tags/caffe/"},{"name":"deep learning","slug":"deep-learning","permalink":"https://hjchen2.github.io/tags/deep-learning/"},{"name":"momentum","slug":"momentum","permalink":"https://hjchen2.github.io/tags/momentum/"}]},{"title":"强化学习(二)","slug":"强化学习(二)","date":"2017-04-25T04:31:08.000Z","updated":"2023-05-19T03:51:57.657Z","comments":true,"path":"2017/04/25/强化学习(二)/","link":"","permalink":"https://hjchen2.github.io/2017/04/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0%EF%BC%88%E4%BA%8C%EF%BC%89/","excerpt":"DQN 前面我们讲到TD算法结合了动态规划和蒙特卡洛算法的优点,不依赖具体的环境模型,并且更新时采用滑动平均的方式,因此单步就能更新,而不需要生成整个episode,在非episode情况下仍然适用。TD算法又分为on policy的sarsa算法和off policy的Q learning算法,其中Q learning算法直接使用下一状态的最大动作值函数进行更新,加快了算法收敛速度,因此Q learning算法在实际应用中更加普遍。","text":"DQN 前面我们讲到TD算法结合了动态规划和蒙特卡洛算法的优点,不依赖具体的环境模型,并且更新时采用滑动平均的方式,因此单步就能更新,而不需要生成整个episode,在非episode情况下仍然适用。TD算法又分为on policy的sarsa算法和off policy的Q learning算法,其中Q learning算法直接使用下一状态的最大动作值函数进行更新,加快了算法收敛速度,因此Q learning算法在实际应用中更加普遍。 Q learning例子 我们用一个例子来说明Q learning算法的过程。下图是一个二叉树表示的路径规划问题,每一个节点代表环境中的一个状态,叶子节点表示终止状态,每个非叶子节点都可以选择向上或向下的动作,然后转移到下一个节点,并获得相应的得分。 首先初始化所有状态动作对的动作值函数:\\(Q(S_{i},a)=0, \\forall i\\in[1,6],a\\in[上, 下]\\),并且初始化\\(\\epsilon = 0.1,\\alpha = 0.1\\)。 随机选择一个初始状态\\(S\\),假设为\\(S_0\\) 根据\\(\\epsilon-greedy\\)策略选择一个动作,假设为上,转移到状态\\(S_1\\),那么更新\\(Q(S_0,上)=Q(S_0,上)+\\alpha\\cdot(R_{1}+\\max_aQ(S_1,a)-Q(S_0,上))=0+0.1\\cdot(10+0-0)=1\\),接下来继续根据\\(\\epsilon-greedy\\)策略选择下一个动作,比如下,并且转移到终止状态\\(S_4\\),因此\\(Q(S_1,下)=Q(S_0,下)+\\alpha\\cdot(R_{2}+\\max_aQ(S_4,a)-Q(S_1,下))=0+0.1\\cdot(100+0-0)=10\\)。 随机选择一个初始状态\\(S\\),假设为\\(S_2\\) 根据\\(\\epsilon-greedy\\)策略选择一个动作,假设为上,转移到终止状态\\(S_5\\),则更新\\(Q(S_2,上)=0+0.1\\cdot(100+0-0)=10\\) 随机选择一个初始状态\\(S\\),假设为\\(S_0\\) 根据\\(\\epsilon-greedy\\)策略选择一个动作,假设为上,转移到状态\\(S_1\\),则更新\\(Q(S_0,上)=1+0.1\\cdot(10+10-1)=2.9\\),选择下一个动作,比如上,则\\(Q(S_1,上)=0+0.1\\cdot(50+0-0)=5\\) 随机选择一个初始状态\\(S\\),假设为\\(S_0\\) 根据\\(\\epsilon-greedy\\)策略选择一个动作,假设为上,转移到状态\\(S_1\\),则更新\\(Q(S_0,上)=2.9+0.1\\cdot(10+10-2.9)=4.61\\),选择下一个动作,比如下,则\\(Q(S_1,下)=10+0.1\\cdot(100+0-10)=19\\) … 下面是该例子的python实现: 
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768"""author: Houjiang Chen"""import randomclass q_learning(object): def __init__(self, states, actions): self.states = states self.actions = actions self.eps = 0.1 self.alpha = 0.1 self.q_table = [[0 for j in range(actions)] for i in range(states)] def get_action(self, current_state): max_action = self.q_table[current_state].index(max(self.q_table[current_state])) if random.uniform(0, 1) > self.eps: return max_action else: rest = [i for i in range(len(self.q_table[current_state])) if i != max_action] index = random.randint(0, len(rest) - 1) return rest[index] def update(self, current_state, action, next_state, reward, final): if final != 1: reward = reward + max(self.q_table[next_state]) self.q_table[current_state][action] += self.alpha * (reward - self.q_table[current_state][action]) class environment(object): def __init__(self): self.level = 2 self.actions = 2 self.states = self.actions ** (self.level + 1) - 1 self.final_states = self.actions ** self.level self.reward = {0 : [10, -10], 1 : [50, 100], 2 : [100, 150]} def next(self, current_state, action): """action: 0 or 1 return: next_state, reward, is_final """ next = 2 * current_state + (action + 1) if next >= self.states - self.final_states: return None, self.reward[current_state][action], 1 else: return next, self.reward[current_state][action], 0 def reset(self): return random.randint(0, self.states - self.final_states - 1) env = environment()agent = q_learning(env.states, env.actions)episode = 0while episode < 100000: episode += 1 print "episode: %d" % episode current_state = env.reset() while True: action = agent.get_action(current_state) next_state, reward, final = env.next(current_state, action) agent.update(current_state, action, next_state, reward, final) if final: break current_state = next_stateprint agent.q_table 最终收敛结果为: 1234[[109.99999999999989, 139.99999999999977], [49.99999999999997, 99.99999999999994], [99.99999999999994, 149.9999999999999], [0, 0], [0, 0], [0, 0], [0, 0]] 函数逼近 上面的例子中非终止状态数只有3个,每个非终止状态对应的动作只有2个,因此状态动作对总共有6个,使用表格存储完全没有问题,但实际上我们需要解决的并不是一个如此简单的问题。比如在【Playing Atari with Deep Reinforcement Learning】中DeepMind就使用Q learning使得agent玩Atari 2600游戏的水平超越了人类水平。在Atari 2600游戏中,每个游戏画面都是一个状态,如果每个画面都是像素为84*84的256灰度图像,那么将会产生\\(256^{84\\cdot84}\\)个状态,用表格进行存储将会变得非常不现实。为了解决状态数爆炸的问题,通常可以使用函数逼近的方法。下面有几种函数表示的方式: 并且逼近函数的形式可以采用: Linear combinations of features Neural network Decision tree Nearest neighbour Fourier / wavelet bases ... 
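在进入DQN之前,这里先用几行代码示意上面列表中最简单的线性特征组合方式:用特征向量与权重的内积近似动作值函数,并以 \\(r+\\gamma\\max_{a^{'}}\\hat{q}(s^{'},a^{'})\\) 为目标做单步随机梯度更新,与下文的推导相对应。其中特征向量、学习率等均为假设的示例,仅为示意:

```python
import numpy as np

def q_hat(w, x):
    # 线性逼近:q(s,a) ≈ x(s,a)^T w
    return np.dot(w, x)

def td_update(w, x, reward, x_next_list, alpha=0.1, gamma=0.9, terminal=False):
    # 以 r + gamma * max_a' q(s',a') 作为目标,对平方误差做单步随机梯度更新
    target = reward
    if not terminal:
        target += gamma * max(q_hat(w, xn) for xn in x_next_list)
    w += alpha * (target - q_hat(w, x)) * x
    return w

# 玩具用法:4维特征,x 为当前(s,a)的特征,x_next_list 为下一状态各动作的特征
w = np.zeros(4)
x = np.array([1.0, 0.0, 0.5, 0.0])
x_next_list = [np.array([0.0, 1.0, 0.0, 0.5]), np.array([1.0, 1.0, 0.0, 0.0])]
w = td_update(w, x, reward=1.0, x_next_list=x_next_list)
```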
下面我们研究的DQN(Deep Q Network)就是采用Deep neural network进行动作值函数逼近的一种方法,结构如下。 为推导方便,假设中间的Network为一层的全连接,即\\(\\hat{V}(s, a)=x(S)^{T}w=\\sum_{j=1}^{n}{x_{j}(S)w_{j}}\\),代价函数选择最小均方误差:\\(J(w)=\\frac{1}{2}(V(s,a)-\\hat{V}(s,a))^2\\),采用随机梯度下降算法进行优化。 \\[\\begin{split}\\frac{\\partial{J(w)}}{\\partial{w}}&=\\left(V(s,a)-\\hat{V}(s,a)\\right)\\frac{\\partial{\\hat{V}(s,a)}} {\\partial{w}} \\\\ &=\\left(V(s,a)-\\hat{V}(s,a)\\right)x(S) \\end{split}\\tag{1-1}\\] \\[\\begin{split}w^k&=w^{k-1}+\\eta \\Delta(w)\\\\&=w^{k-1}-\\eta \\frac{\\partial{J(w)}}{\\partial{w}}\\\\&=w^{k-1}-\\eta \\left(V(s,a)-\\hat{V}(s,a;w^{k})\\right)x(S)\\end{split}\\tag{1-2}\\] 由于我们并没有动作值函数的真实值,因此与Q learning类似,\\(V(s,a,)\\)可以使用下一个状态的动作值函数进行估计,即\\(V(s,a)=V(s,a;w^{k-1})=r+\\gamma \\max_{a^{'}}V(s^{'},a^{'};w^{k-1})\\)。 整个训练过程仍然与Q learning一样,采用\\(\\epsilon-greedy\\)策略选择动作,并按照公式(1-2)更新权重\\(w\\),实际上也就更新了策略的动作值函数。使用值函数逼近的方法不需要枚举每个状态动作对,突破了状态数的限制,使得Q learning在一些复杂任务上得到广泛应用,但仍然没有解决动作数爆炸或者连续动作的问题。 DQN DQN最先出现于DeepMind发表的【Playing Atari with Deep Reinforcement Learning】论文中,由于需要直接输入图像画面,因此论文中使用CNN来表示Q函数,下面简单剖析一下该论文。 使用的是典型的CNN,其结构为: 与一般的CNN有所不同的是,没有pooling层,因为我们这里不是做图像分类,pooling层带来的旋转和数值不变性对分类是有作用的,但在这个任务中对物体的具体位置是非常敏感的,因此移除了pooling层。 Atari原始的游戏帧为210\u0002*160像素的RGB图像,由于该任务对画面色彩不敏感,为了减少计算开销,将游戏帧预处理成84*84的灰度图像。但为了获得动态特征,最终是将前3帧图像与当前帧stack到一起组成一个4*84*84的图像作为CNN的输入,输出为每个动作对应的Q值。 经验回放 现在我们知道可以使用Q learning去估计每个状态的未来回报的期望,并且可以使用CNN去逼近动作值函数,也就是可以使用DQN去解决一个复杂的MDP任务。但在实际应用时会出现更新波动较大,导致收敛非常慢的问题,DeepMind因此使用了一个经验回放(Experience Replay)机制,就是将每步的经验数据\\(<s,a,r,s^{'}>\\)存放在回放内存中,更新时都从回放内存中随机采样一个batch的数据进行更新。 经验回放机制相比标准的DQN有两个好处:首先每一步的经验数据会被保存起来,更新时可以多次使用到经验数据,使得数据利用更高效;此外直接从连续的样本中学习是低效的,因为一个episode内样本具有很强的相关性,随机挑选样本打破了这种相关性,因此减小了更新时的变化,使得更新更加稳定(注:因为同一次实验过程的样本相关性很强,不同实验之间的相关性就显得相对比较小,如果使用连续的样本进行训练,在切换到下一次实验的样本时会导致模型更新不稳定)。 由于内存大小限制,回放内存不可能将所有的经验数据都保存起来,因此只会保留最新的N组经验数据,比较久远的数据就会被遗忘。 训练 DeepMind使用DQN对 ATARI中七个游戏进行了实验,由于每个游戏的得分尺度不一致,因此他们将得分分为正回报、负回报和无回报,正回报得分为1,负回报得分为-1,无回报得分为0。 使用 RMSProp算法进行优化,batch size为32,采用\\(\\epsilon-greedy\\)行动策略,前一百万帧的\\(\\epsilon\\)从1线性减少到0.1,最后固定为0.1。总共训练了一千万帧,并且使用了一百万大小的回放内存。 训练过程伪代码: Gym使用 Gym简介 目前强化学习的研究主要由DeepMind和OpenAI两家在主导,去年底到今年初DeepMind和OpenAI相继开源了自家的3D learning environment平台DeepMind Lab和Universe。DeepMind Lab目前给出的文档和例子都比较少,使用也稍显复杂,所以暂时可以不考虑使用。Universe包含了1000+的游戏环境,并且将程序打包在docker环境中运行,提供与Gym一致的接口。Universe的环境由一个client和一个remote组成,client是一个VNCenv,主要负责接收agent的动作,传递回报和管理本地episode的状态,remote是指在docker环境中运行的程序,remote可以运行在本地、远程服务器或在cloud上。client和remote通过VNC远程桌面系统进行交互,通过WebSocket传递回报、诊断和控制信息。 由于Universe环境提供Gym接口,而Gym是OpenAI去年4月份发布的一套开发和比较强化学习算法的toolkit。Gym本身是可以独立于Universe使用的,并且Universe和Gym中agent代码基本没有什么区别。我们下面就单独讲讲Gym接口和如何使用Gym训练自己的agent。 Gym目前提供python接口,并支持任何的计算框架,比如tensorflow、theano等。强化学习解决的是agent和环境交互的任务,agent根据当前环境状态做出某个动作,然后观察下一个状态和回报,环境根据agent的动作转移到下一个状态,并发送回报。Gym提供的实际上是环境这个角色,每个Gym环境都提供一致的接口。 创建一个Gym环境 创建一个环境时只需要指定环境id,比如agent需要玩Atari Breakout-v0这个游戏,可以如下创建一个Breakout-v0的环境。 12import gymenv = gym.make('Breakout-v0') step 输入agent的动作,返回4个值,分别为: observation:表示agent观察到的下一个状态,比如在一些游戏中,observation为RGB的图像 reward:表示执行输入的动作后得到的回报值 done:表示返回的observation是不是结束状态 info:调试信息,一般没什么用处 1next_state, reward, terminal, _ = env.step(action) reset 在开始一个新的episode时,Gym环境都要reset,获得一个初始状态。 1init_state = env.reset() render render是Gym用来渲染环境状态的函数,当调用该函数时会出现一个动图框。一般agent执行一个动作,环境都要渲染一次,这样就可以实时看到agent的执行情况了。 1env.render() Spaces 
Gym环境有两个space属性,一个是action_space,一个是observation_space,分别表示该Gym环境下合法的动作和状态。action_space是Gym中的一个Discrete对象,Discrete对象有一个成员n,表示合法的动作数,比如Discrete(2)表示有两个合法动作,编号从0开始,因此两个动作编号为0和1。observation_space是Gym中的一个Box对象,Box的shape表示observation的数据组织方式,比如Box(210, 160, 3)表示合法的observation是一个210*160*3的数组,而Box(4,)表示observation是一个大小为4的向量。 12observation_space = env.observation_space # observation_space: Discrete(6)action_space = env.action_space # action_space: Box(210, 160, 3) Breakout-v0例子 采用了github上Flood Sung的DQN实现,感谢Flood Sung大神的无私贡献。 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193# -----------------------------# File: Deep Q-Learning Algorithm# Author: Flood Sung# Date: 2016.3.21# -----------------------------import tensorflow as tfimport numpy as npimport randomfrom collections import deque# Hyper Parameters:FRAME_PER_ACTION = 1GAMMA = 0.99 # decay rate of past observationsOBSERVE = 100. # timesteps to observe before trainingEXPLORE = 200000. # frames over which to anneal epsilonFINAL_EPSILON = 0#0.001 # final value of epsilonINITIAL_EPSILON = 0#0.01 # starting value of epsilonREPLAY_MEMORY = 50000 # number of previous transitions to rememberBATCH_SIZE = 32 # size of minibatchUPDATE_TIME = 100class BrainDQN: def __init__(self,actions): # init replay memory self.replayMemory = deque() # init some parameters self.timeStep = 0 self.epsilon = INITIAL_EPSILON self.actions = actions # init Q network self.stateInput,self.QValue,self.W_conv1,self.b_conv1,self.W_conv2,self.b_conv2,self.W_conv3,self.b_conv3,self.W_fc1,self.b_fc1,self.W_fc2,self.b_fc2 = self.createQNetwork() # init Target Q Network self.stateInputT,self.QValueT,self.W_conv1T,self.b_conv1T,self.W_conv2T,self.b_conv2T,self.W_conv3T,self.b_conv3T,self.W_fc1T,self.b_fc1T,self.W_fc2T,self.b_fc2T = self.createQNetwork() self.copyTargetQNetworkOperation = [self.W_conv1T.assign(self.W_conv1),self.b_conv1T.assign(self.b_conv1),self.W_conv2T.assign(self.W_conv2),self.b_conv2T.assign(self.b_conv2),self.W_conv3T.assign(self.W_conv3),self.b_conv3T.assign(self.b_conv3),self.W_fc1T.assign(self.W_fc1),self.b_fc1T.assign(self.b_fc1),self.W_fc2T.assign(self.W_fc2),self.b_fc2T.assign(self.b_fc2)] self.createTrainingMethod() # saving and loading networks self.saver = tf.train.Saver() self.session = tf.InteractiveSession() self.session.run(tf.initialize_all_variables()) checkpoint = tf.train.get_checkpoint_state("saved_networks") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.session, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path else: print "Could not find old network weights" def createQNetwork(self): # network weights W_conv1 = self.weight_variable([8,8,4,32]) b_conv1 = self.bias_variable([32]) W_conv2 = self.weight_variable([4,4,32,64]) b_conv2 = self.bias_variable([64]) W_conv3 = self.weight_variable([3,3,64,64]) b_conv3 = self.bias_variable([64]) W_fc1 = self.weight_variable([1600,512]) b_fc1 = self.bias_variable([512]) W_fc2 = self.weight_variable([512,self.actions]) b_fc2 = self.bias_variable([self.actions]) # input layer stateInput = 
tf.placeholder("float",[None,80,80,4]) # hidden layers h_conv1 = tf.nn.relu(self.conv2d(stateInput,W_conv1,4) + b_conv1) h_pool1 = self.max_pool_2x2(h_conv1) h_conv2 = tf.nn.relu(self.conv2d(h_pool1,W_conv2,2) + b_conv2) h_conv3 = tf.nn.relu(self.conv2d(h_conv2,W_conv3,1) + b_conv3) h_conv3_flat = tf.reshape(h_conv3,[-1,1600]) h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat,W_fc1) + b_fc1) # Q Value layer QValue = tf.matmul(h_fc1,W_fc2) + b_fc2 return stateInput,QValue,W_conv1,b_conv1,W_conv2,b_conv2,W_conv3,b_conv3,W_fc1,b_fc1,W_fc2,b_fc2 def copyTargetQNetwork(self): self.session.run(self.copyTargetQNetworkOperation) def createTrainingMethod(self): self.actionInput = tf.placeholder("float",[None,self.actions]) self.yInput = tf.placeholder("float", [None]) Q_Action = tf.reduce_sum(tf.mul(self.QValue, self.actionInput), reduction_indices = 1) self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action)) self.trainStep = tf.train.AdamOptimizer(1e-6).minimize(self.cost) def trainQNetwork(self): # Step 1: obtain random minibatch from replay memory minibatch = random.sample(self.replayMemory,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] nextState_batch = [data[3] for data in minibatch] # Step 2: calculate y y_batch = [] QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT:nextState_batch}) for i in range(0,BATCH_SIZE): terminal = minibatch[i][4] if terminal: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * np.max(QValue_batch[i])) self.trainStep.run(feed_dict={ self.yInput : y_batch, self.actionInput : action_batch, self.stateInput : state_batch }) # save network every 100000 iteration if self.timeStep % 10000 == 0: self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step = self.timeStep) if self.timeStep % UPDATE_TIME == 0: self.copyTargetQNetwork() def setPerception(self,nextObservation,action,reward,terminal): #newState = np.append(nextObservation,self.currentState[:,:,1:],axis = 2) newState = np.append(self.currentState[:,:,1:],nextObservation,axis = 2) self.replayMemory.append((self.currentState,action,reward,newState,terminal)) if len(self.replayMemory) > REPLAY_MEMORY: self.replayMemory.popleft() if self.timeStep > OBSERVE: # Train the network self.trainQNetwork() # print info state = "" if self.timeStep <= OBSERVE: state = "observe" elif self.timeStep > OBSERVE and self.timeStep <= OBSERVE + EXPLORE: state = "explore" else: state = "train" print "TIMESTEP", self.timeStep, "/ STATE", state, \\ "/ EPSILON", self.epsilon self.currentState = newState self.timeStep += 1 def getAction(self): QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]})[0] action = np.zeros(self.actions) action_index = 0 if self.timeStep % FRAME_PER_ACTION == 0: if random.random() <= self.epsilon: action_index = random.randrange(self.actions) action[action_index] = 1 else: action_index = np.argmax(QValue) action[action_index] = 1 else: action[0] = 1 # do nothing # change episilon if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE: self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON)/EXPLORE return action def setInitState(self,observation): self.currentState = np.stack((observation, observation, observation, observation), axis = 2) def weight_variable(self,shape): initial = tf.truncated_normal(shape, stddev = 0.01) return tf.Variable(initial) def bias_variable(self,shape): initial = tf.constant(0.01, shape = shape) return 
tf.Variable(initial) def conv2d(self,x, W, stride): return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") def max_pool_2x2(self,x): return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 下面是使用上面的DQN让agent玩Gym的Breakout-v0游戏。 1234567891011121314151617181920212223242526272829303132333435363738394041424344# -------------------------# Project: Deep Q-Learning on Breakout-v0# Author: Houjiang Chen# Date: 2017.4.25# -------------------------import cv2import gymfrom BrainDQN_Nature import BrainDQNimport numpy as np# preprocess raw image to 80*80 gray imagedef preprocess(observation): observation = cv2.cvtColor(cv2.resize(observation, (80, 80)), cv2.COLOR_BGR2GRAY) #ret, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY) return np.reshape(observation, (80, 80, 1))def play(): env = gym.make('Breakout-v0') actions = env.action_space.n # init BrainDQN brain = BrainDQN(actions) while 1: state = env.reset() state = cv2.cvtColor(cv2.resize(state, (80, 80)), cv2.COLOR_BGR2GRAY) #ret, state = cv2.threshold(state, 1, 255, cv2.THRESH_BINARY) brain.setInitState(state) while 1: action = brain.getAction() state, reward, terminal, _ = env.step(np.argmax(action)) env.render() if terminal: break state = preprocess(state) brain.setPerception(state, action, reward, terminal)def main(): play()if __name__ == '__main__': main() 参考资料 1、Reinforcement Learning: An Introduction, Richard S. Sutton and Andrew G. Barto,2012 2、Playing Atari with Deep Reinforcement Learning,DeepMind Technologies,Arxiv 2013.12 3、Human-level control through deep reinforcement learning,DeepMind Technologies,Nature 2015.02 4、DeepMind官网 https://deepmind.com/blog/deep-reinforcement-learning 5、https://www.nervanasys.com/demystifying-deep-reinforcement-learning 6、http://www.cnblogs.com/jinxulin/p/3511298.html 7、Introduction to Reinforcement Learning,David Silver","categories":[{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/categories/reinforcement-learning/"}],"tags":[{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/tags/reinforcement-learning/"},{"name":"machine learning","slug":"machine-learning","permalink":"https://hjchen2.github.io/tags/machine-learning/"}]},{"title":"值函数的贝尔曼公式推导","slug":"值函数的贝尔曼公式推导","date":"2017-04-10T04:31:08.000Z","updated":"2023-01-03T14:06:55.430Z","comments":true,"path":"2017/04/10/值函数的贝尔曼公式推导/","link":"","permalink":"https://hjchen2.github.io/2017/04/10/%E5%80%BC%E5%87%BD%E6%95%B0%E7%9A%84%E8%B4%9D%E5%B0%94%E6%9B%BC%E5%85%AC%E5%BC%8F%E6%8E%A8%E5%AF%BC/","excerpt":"下面的推导过程中第2步和第5步两次用到重期望公式: \\(\\bf{EX}=\\bf{E\\left(E\\left[X\\mid Y\\right]\\right)}\\)。","text":"下面的推导过程中第2步和第5步两次用到重期望公式: \\(\\bf{EX}=\\bf{E\\left(E\\left[X\\mid Y\\right]\\right)}\\)。 \\[\\begin{split} \\upsilon_{\\pi}(s)&={\\bf{E_{\\pi}}}\\left[G_{t}\\mid{S_{t}=s}\\right] \\\\ &={\\bf{E_{\\pi}}}\\left({\\bf{E_{\\pi}}}\\left[G_t\\mid S_t=s,A_t\\right]\\right) \\\\ &={\\bf{E_{\\pi}}}\\left[\\sum_a\\pi(a|s)G_t\\mid S_t=s,A_t=a\\right] \\\\ &=\\sum_a\\pi(a|s){\\bf{E_{\\pi}}}\\left[G_t\\mid S_t=s,A_t=a\\right] \\\\ &=\\sum_a\\pi(a|s){\\bf{E_{\\pi}}}\\left({\\bf{E_{\\pi}}}\\left[G_t\\mid S_t=s,A_t=a,S_{t+1}\\right]\\right) \\\\ &=\\sum_a\\pi(a|s){\\bf{E_{\\pi}}}\\left[\\sum_{s^{'}}p(s^{'}\\mid s,a)G_t\\mid S_t=s,A_t=a,S_{t+1}=s^{'}\\right] \\\\ &=\\sum_a\\pi(a|s)\\sum_{s^{'}}p(s^{'}\\mid s,a){\\bf{E_{\\pi}}}\\left[G_t\\mid S_t=s,A_t=a,S_{t+1}=s^{'}\\right] \\\\ 
&=\\sum_{a}\\pi(a\\mid{s})\\sum_{s^{'}}p(s^{'}\\mid s,a){\\bf E}_{\\pi}\\left[R_{t+1}+\\gamma\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+2}\\mid{S_{t}=s,A_{t}=a,S_{t+1}=s^{'}}\\right] \\\\ &=\\sum_{a}\\pi(a\\mid{s})\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma{\\bf E}_{\\pi}\\left[\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+2}\\mid{S_{t+1}=s^{'}}\\right]\\right] \\\\ &=\\sum_{a}\\pi(a\\mid{s})\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{\\pi}(s^{'})\\right] \\end{split}\\]","categories":[{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/categories/reinforcement-learning/"}],"tags":[{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/tags/reinforcement-learning/"},{"name":"machine learning,贝尔曼公式推导","slug":"machine-learning,贝尔曼公式推导","permalink":"https://hjchen2.github.io/tags/machine-learning%EF%BC%8C%E8%B4%9D%E5%B0%94%E6%9B%BC%E5%85%AC%E5%BC%8F%E6%8E%A8%E5%AF%BC/"}]},{"title":"强化学习(一)","slug":"强化学习(一)","date":"2017-03-27T04:31:08.000Z","updated":"2023-02-07T02:39:23.375Z","comments":true,"path":"2017/03/27/强化学习(一)/","link":"","permalink":"https://hjchen2.github.io/2017/03/27/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0%EF%BC%88%E4%B8%80%EF%BC%89/","excerpt":"前言 近几年,由于DeepMind成功地将强化学习(reinforcement learning)运用在AlphaGo上,机器首次在复杂任务上取得了超过人类的表现,使得强化学习成为目前机器学习研究的前沿方向之一。强化学习由来已久,Sutton等在1979年就已经开始研究强化学习,1998年出版了强化学习介绍一书,并于2012年发布第二版,本文前几部分内容主要参考该书。","text":"前言 近几年,由于DeepMind成功地将强化学习(reinforcement learning)运用在AlphaGo上,机器首次在复杂任务上取得了超过人类的表现,使得强化学习成为目前机器学习研究的前沿方向之一。强化学习由来已久,Sutton等在1979年就已经开始研究强化学习,1998年出版了强化学习介绍一书,并于2012年发布第二版,本文前几部分内容主要参考该书。 强化学习最早主要用于智能控制领域,比如机器人控制、电梯调度、电信通讯等,如今已经在自动驾驶、NLP、内容推荐[4]和语音交互领域都有相关的应用。2013年底DeepMind发表文章Playing Atari with Deep Reinforcement Learning,首次成功地将深度学习运用到强化学习任务上,通过无监督学习实现从纯图像输入来玩Atari 2600游戏的效果。而后DeepMind逐渐改进算法,使得DQN在Atari几乎一半的游戏中超过人类水平,以至2016年AlphaGo和无人车的出现,人们惊奇地发现人工智能即将颠覆我们的生活,甚至有人评论说传统的深度学习已经可以很好地感知理解了,强化学习可以利用这些感知生成策略,因而可以创造更高的机器智能。 下面是DeepMind使用DQN让机器学习玩Atari 2600游戏的视频。 什么是强化学习 Reinforcement learning is learning what to do—how to map situations to actions—so as to maximize a numerical reward signal[1]. 
强化学习研究的是智能体agent与环境之间交互的任务,也就是让agent像人类一样通过试错,不断地学习在不同的环境下做出最优的动作,而不是有监督地直接告诉agent在什么环境下应该做出什么动作。在这里我们需要引入回报(reward)这个概念,回报是执行一个动作或一系列动作后得到的奖励,比如在游戏超级玛丽中,向上跳可以获得一个金币,也就是回报值为1,而不跳时回报就是0。回报又分为立即回报和长期回报,立即回报指的是执行当前动作后能立刻获得的奖励,但很多时候我们执行一个动作后并不能立即得到回报,而是在游戏结束时才能返回一个回报值,这就是长期回报。强化学习唯一的准则就是学习通过一序列的最优动作,获得最大的长期回报。比较有挑战性的是,任一状态下做出的动作不仅影响当前状态的立即回报,而且也会影响到下一个状态,因此也就会影响整个执行过程的回报。 因此,强化学习和监督学习的区别主要有以下两点[6]: 强化学习是试错学习(Trail-and-error),由于没有直接的指导信息,智能体要以不断与环境进行交互,通过试错的方式来获得最佳策略。 延迟回报,强化学习的指导信息很少,而且往往是在事后(最后一个状态)才给出的,这就导致了一个问题,就是获得正回报或者负回报以后,如何将回报分配给前面的状态。 问题描述与MDP 前面已经提到强化学习是尝试并发现回报最大动作的过程,下面就具体来描述一下这个过程。首先考虑一个问题,一个之前完全没有接触过国际象棋的小白怎样和一个专业棋手对弈。刚开始小白对棋面并没有任何概念,只能随机下,但假设双方每一轮下完后都会得到立即回报,比如吃子回报为1,被吃回报为-1,其他回报为0。可以想象一开始小白会输得很惨,但如果小白很聪明,随着不断地尝试小白不仅理解了下棋的规则,并且知道在什么棋面下做出什么动作可以吃更多的棋子。在这里我们将小白作为我们的智能体agent,棋面就是状态,下棋就是agent根据当前状态做出的动作,每个动作执行完后都会引起状态改变,如果状态的改变只与前一个状态和当前的动作有关,而与之前的状态和动作无关(即满足马尔可夫性),那么整个过程可以用马尔可夫决策过程(Markov Decision Processes)来描述,而Sutton在书中直接将满足马尔可夫性的强化学习任务定义为马尔可夫决策过程,并将状态和动作都是有限空间的MDP定义为有限马尔可夫决策过程(finite MDP)。 下面引入一些定义[1]:马尔可夫决策过程是一个agent与环境交互的过程,因此有一个离散的时间序列,\\(t=0,1,2,3,...\\),在每一个时刻\\(t\\),agent都会接收一个用来表示环境的状态\\(S_{t}\\in\\bf{S}\\),其中\\(\\bf{S}\\)表示所有可能状态的集合,并且在状态的基础上选择一个动作\\(A_{t}\\in{\\bf{A}}(S_{t})\\),其中\\({\\bf{A}}(S_{t})\\)表示在状态\\(S_{t}\\)时所有可能采取的动作的集合,在\\(t\\)时刻agent采取一个动作后都会收到一个回报值\\(R_{t+1}\\in\\bf{R}\\),然后接收一个新状态\\(S_{t+1}\\)。下图为整个过程的示意图。 在任意时刻和状态下,agent都可以选择一个动作,选择的依据就是我们说的策略—即状态到动作的映射\\(\\pi(a\\mid{s})\\),而一个使得在任意时刻和状态下的长期回报都是最大的策略是我们最终需要得到的。所谓长期回报我们可以用每个时刻的立即回报来表示: \\[G_{t}=R_{t+1}+R_{t+2}+R_{t+3}+...=\\sum_{k=t+1}^{\\infty}R_{k}\\tag{1.1}\\] 但实际上我们一般会用下面更通用的公式来代替: \\[G_{t}=R_{t+1}+\\gamma{R_{t+2}}+\\gamma^2{R_{t+3}}+...+\\gamma^{T-t-1}{R_{T}}=\\sum_{k=0}^{T-t-1}\\gamma^{k}R_{t+k+1}\\tag{1.2}\\] 其中\\(\\gamma\\in[0,1]\\)称为回报折扣因子,表明了未来的回报相对于当前回报的重要程度。\\(\\gamma=0\\)时,相当于只考虑立即回报不考虑长期回报,\\(\\gamma=1\\)时,将长期回报和立即回报看得同等重要。\\(T\\in[1,\\infty]\\)表示完成一次实验过程的总步数,\\(T=\\infty\\)和\\(\\gamma=1\\)不能同时满足,否则长期回报将无法收敛。特别地,我们将一次有限步数的实验称作一个单独的episodes,也就是经过有限步数后最终会接收一个终止状态,这一类的任务也叫做episodic tasks。下面讨论的强化学习任务都是有限MDP的episodic tasks。 马尔可夫决策过程 一个有限马尔可夫决策过程由一个四元组构成 \\(M=({\\bf{S}}, {\\bf{A}}, {\\bf{P}}, {\\bf{R}})\\)[6]。如上所述,\\(\\bf{S}\\)表示状态集空间,\\({\\bf{A}}\\)表示动作集空间,\\({\\bf{P}}\\)表示状态转移概率矩阵,\\({\\bf{R}}\\)表示期望回报值。 在MDP中给定任何一个状态\\(s\\in\\bf{S}\\)和动作\\(a\\in\\bf{A}\\),都会以某个概率转移到下一个状态\\(s^{'}\\),这个概率为\\(p(s^{'}\\mid s, a)={\\bf{Pr}}\\{S_{t+1}=s^{'}\\mid S_{t}=s, A_{t}=a\\}\\in\\bf{P}\\),并获得下一个回报的期望值为\\(r(s,a,s^{'})={\\bf{E}}\\left[R_{t+1}\\mid{S_{t}=s,A_{t}=a,S_{t+1}=s^{'}}\\right]\\in\\bf{R}\\)。 值函数及贝尔曼公式 增强学习的最终结果是找到一个环境到动作的映射—即策略\\(\\pi(a\\mid{s})\\)。如果一个策略只考虑立即回报,那么很可能就会掉入眼前陷阱。比如说有一个岔路口,往左回报是100,往右回报是10,如果策略只考虑立即回报,那肯定是往左,但往左走的下一次回报只有10,而往右走的下一次回报有200,可以看到这个策略并不是最优的策略,此外增强学习又往往有具有延迟回报的特点,在很多情况下的动作并不会产生立即回报,但这一系列动作的累积效果又的确会导致后续回报的产生,因此立即回报并不能说明策略的好坏。在几乎所有的强化学习理论中都会定义值函数来表示给定策略下期望的未来回报,并将值函数作为评估学习效果的指标。 值函数有多种定义,目前常见的是将值函数直接定义为未来回报的期望: \\[ \\upsilon_{\\pi}(s)={\\bf{E_{\\pi}}}\\left[G_{t}\\mid{S_{t}=s}\\right]={\\bf{E_{\\pi}}}\\left[\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+1}\\mid{S_{t}=s}\\right]\\tag{2.1} \\] 上面表示的是在某个策略\\(\\pi\\)下,当环境处于状态\\(s\\)时未来回报的期望,因此又叫做状态值函数(state-value function for policy),只跟当前状态有关。同样,我们也可以定义动作值函数(action-value function for policy),如下: \\[ \\begin{split}q_{\\pi}(s,a)&={\\bf{E_{\\pi}}}\\left[G_{t}\\mid{S_{t}=s,A_{t}=a}\\right] \\\\ &={\\bf{E_{\\pi}}}\\left[\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+1}\\mid{S_{t}=s,A_{t}=a}\\right]\\end{split}\\tag{2.2} \\] 
动作值函数表示在某个策略\\(\\pi\\)下,当环境处于状态\\(s\\)时采取动作\\(a\\)的未来回报的期望。可以看到动作值函数与状态值函数唯一的不同是动作值函数不仅指定了一个初始状态,而且也指定了初始动作,而状态值函数的初始动作是根据策略产生的。由于在MDP中,给定状态\\(s\\),agent根据策略选择动作\\(a\\),下个时刻将以概率\\(p(s^{'}\\mid{s,a})\\)转移到状态\\(s^{'}\\),因此值函数又可以改写成如下形式: \\[ \\begin{split}\\upsilon_{\\pi}(s)&={\\bf{E_{\\pi}}}\\left[G_{t}\\mid{S_{t}=s}\\right] \\\\ &={\\bf{E_{\\pi}}}\\left[\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+1}\\mid{S_{t}=s}\\right] \\\\ &={\\bf{E_{\\pi}}}\\left[R_{t+1}+\\gamma\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+2}\\mid{S_{t}=s}\\right] \\\\ &=\\sum_{a}\\pi(a\\mid{s})\\cdot{\\bf E}_{\\pi}\\left[R_{t+1}+\\gamma\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+2}\\mid{S_{t}=s,A_{t}}\\right] \\\\ &=\\sum_{a}\\pi(a\\mid{s})\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma{\\bf E}_{\\pi}\\left[\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+2}\\mid{S_{t+1}=s^{'}}\\right]\\right] \\\\ &=\\sum_{a}\\pi(a\\mid{s})\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{\\pi}(s^{'})\\right]\\end{split} \\tag{2.3} \\] 也就是说在策略\\(\\pi\\)下当前状态的值函数可以通过下一个状态的值函数来迭代求解,这个公式被称为\\(\\upsilon_{\\pi}\\)的贝尔曼公式(Bellman equation for \\(\\upsilon_{\\pi}\\))。 同样,动作值函数也可以写成相似的形式: \\[ \\begin{split}q_{\\pi}(s,a)&={\\bf{E_{\\pi}}}\\left[G_{t}\\mid{S_{t}=s,A_{t}=a}\\right] \\\\ &={\\bf{E_{\\pi}}}\\left[R_{t+1}+\\gamma\\sum_{k=0}^{\\infty}\\gamma^{k}R_{t+k+2}\\mid{S_{t}=s,A_{t}=a}\\right] \\\\ &=\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{\\pi}(s^{'})\\right]\\end{split}\\tag{2.4} \\] \\(\\upsilon_{\\pi}(s)\\)也可以用\\(q_{\\pi}(s,a)\\)来表示: \\[\\upsilon_{\\pi}(s)=\\sum_{a}\\pi(a\\mid{s})q_{\\pi}(s,a)\\tag{2.5}\\] 下面是迭代计算\\(\\upsilon_{\\pi}(s)\\)和\\(q_{\\pi}(s,a)\\)的图解[1],可以与上述公式对照理解。 最优值函数及贝尔曼最优公式 上面所说的值函数都是未来回报的期望值,而我们需要得到的最优策略必然是使得任意时刻未来回报的期望值都是最大的,也就是说我们的优化目标可以表示为: \\[\\pi_{*}=\\mathop{\\arg\\max}_{\\mathbf{\\pi}}\\upsilon_{\\pi}(s)\\tag{2.6}\\] 当然最优策略可能不止一个,但这些最优策略都有一个共同的特点,就是它们共享同样的状态值函数,这个状态值函数叫做最优状态值函数(optimal state-value function),用\\(\\upsilon_{*}\\)来表示。对于所有的\\(s\\in\\bf{S}\\), \\[\\upsilon_{*}(s)=\\max_{\\mathbf{\\pi}}\\upsilon_{\\pi}(s)\\tag{2.7}\\] 最优策略同样也共享相同的动作值函数(optimal action-value function),用\\(q_{*}\\)来表示。对于所有的\\(s\\in\\bf{S}\\),\\(a\\in{\\bf{A}}(s)\\), \\[q_{*}(s,a)=\\max_{\\mathbf{\\pi}}q_{\\pi}(s,a)\\tag{2.8}\\] 回顾一下上面动作值函数的改写公式(2.4),\\(q_{\\pi}(s,a)=\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{\\pi}(s^{'})\\right]\\),由于动作值函数表示的是给定初始动作,后面的动作遵循策略\\(\\pi\\),因此最优动作值函数后面的动作应当遵循最优策略\\(\\pi_{*}\\),不难得到下面的公式。 \\[q_{*}(s,a)=\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{*}(s^{'})\\right]\\tag{2.9}\\] 至此,最优值函数的形式已经给出了,现在我们继续回顾一下公式(2.5)的意义,\\(\\upsilon_{\\pi}(s)\\)的值是\\(q_{\\pi}(s,a)\\)的期望,那么必然存在\\(\\upsilon_{\\pi}(s)\\leq \\max q_{\\pi}(s,a)\\)。但对于最优策略来说, \\[ \\begin{split}\\upsilon_{*}(s)&=\\max_{\\mathbf{a}} q_{*}(s,a) \\\\ &=\\max_{\\mathbf{a}}\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{*}(s^{'})\\right] \\end{split}\\tag{2.10} \\] \\[ q_{*}(s,a)=\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\max_{\\mathbf{a^{'}}}q_{*}(s^{'},a^{'})\\right]\\tag{2.11} \\] 与状态值函数的贝尔曼公式一样,最优状态值函数和最优动作值函数也可以表示成递归的形式,因此公式(2.10)和公式(2.11)又分别叫做状态值函数和动作值函数的贝尔曼最优公式(Bellman optimality equation)。因为没有\\(\\pi(a\\mid{s})\\),不需要根据策略生成动作,因此贝尔曼最优公式完全独立于策略,但如果我们已知\\(\\upsilon_{*}\\)或\\(q_{*}\\),都可以很容易地得到最优策略。 如果我们已知\\(\\upsilon_{*}\\),而且在每一步都有多个动作可以选择,可以想到最优策略的\\(\\upsilon_{*}(s)\\)必然是满足贝尔曼最优公式的,因此至少有一个动作会满足公式中的最大化条件。任何一个采用上述动作并能够以非零概率转移到下一个状态的策略都是最优策略。我们可以把当前动作的选择看成是一个单步搜索(one-step 
search)的问题,在某个状态下单步搜索结果最大的动作即最优动作,而每个状态下都采取最优动作的策略即最优策略。如果我们已知\\(q_{*}\\),那么只需要在每一步都选择使得\\(q_{*}(s,a)\\)最大的动作,就可以得到一个最优策略。 贝尔曼公式与贝尔曼最优公式是MDP求解的基础,下面主要介绍几种MDP求解的方法。 动态规划方法 动态规划(dynamic programming)指的是能够用来解决给定环境模型,计算最优策略的算法总称。典型的动态规划算法存在两个问题,一是需要依赖一个非常好的环境状态转移模型,二是计算的开销非常大,因此在增强学习中几乎不会直接用动态规划求解MDP,但动态规划理论还是非常重要的,因为后面的一些算法都是在动态规划的基础上,摆脱模型依赖并尽可能地减少计算量。 策略估计 首先,我们考虑一下如果已知策略\\(\\pi\\),如何来计算\\(\\upsilon_{\\pi}\\)。这个问题被称作DP迭代中的策略估计(policy evaluation)。 先举一个例子,一个岔路口有向左和向右两个方向,向左回报为10,向右回报为100,我们没有任何先验知识,但我们需要估计站在路口的值函数,也就是估计当前状态的值函数,该如何来估计呢?首先我们将值函数初始化为0,然后进行大量的尝试,每次都以0.5的概率选择方向左,并获得回报10,以0.5的概率选择方向右,获得回报100。那么只要能将这两个方向都至少遍历一遍,就可以得到该状态的值函数\\(\\upsilon_{随机策略}=\\frac{1}{N}\\sum_{i=0}^{N}{0.5\\cdot R_{i}}\\),其中\\(N\\)为实验的总次数。 同样,我们也是采用相似的方法迭代来进行策略估计的。首先将所有的\\(\\upsilon_{\\pi}(s)\\)都初始化为0(或者任意值,但终止状态必须为0),然后采用如下公式更新所有状态\\(s\\)的值函数。 \\[ \\begin{split}\\upsilon_{k+1}(s) &={\\bf{E}}_{\\pi}\\left[R_{t+1}+\\gamma \\upsilon_{k}(S_{t+1})\\mid S_{t}=s \\right] \\\\ &=\\sum_{a}\\pi(a\\mid{s})\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{k}(s^{'})\\right] \\end{split}\\tag{3.1} \\] 其中\\(\\upsilon_{k+1}(s)\\)表示在当前策略下第\\(k+1\\)次迭代状态\\(s\\)的值函数,\\(\\upsilon_{k}(s^{'})\\)表示在当前策略下第\\(k\\)次迭代状态\\(s^{'}\\)的值函数,该公式就是用上一次迭代计算得到的值函数来更新本次迭代的值函数。在具体操作时,又有两种更新方法[6], 将第\\(k\\)次迭代计算得到的所有状态值函数\\(\\left[\\upsilon_{k}(s_{1}),\\upsilon_{k}(s_{2}),\\upsilon_{k}(s_{3}),...\\right]\\)保存在一个数组中,第\\(k+1\\)次迭代的\\(\\upsilon_{k+1}(s)\\)使用第\\(k\\)次的\\(\\upsilon_{k}(s^{'})\\)进行更新,更新后的值保存在另一个数组中。 仅用一个数组来保存各状态的值函数,每次更新后就将原来的值覆盖。这样在第\\(k+1\\)次迭代时\\(\\upsilon_{k+1}(s)\\)就有可能使用的是第\\(k+1\\)次更新后的\\(\\upsilon_{k+1}(s^{'})\\),这样可以及时地利用更新的值函数,收敛更快。 下面为整个策略估计的算法过程: 策略改进 策略估计是为了计算当前策略下各状态的值函数,那得到值函数又有什么用呢?首先我们可以用来比较两个策略的好坏,如果状态值函数是已知的,那么就可以根据公式(2.4)计算动作值函数,如果一个策略\\(\\pi\\)的所有动作值函数都大于另一个策略\\(\\pi^{'}\\),那么可以认为策略\\(\\pi\\)比策略\\(\\pi^{'}\\)更好。其次,最主要的用处是可以用来进行策略改进(policy improvement)。 仍然是上面岔路口的例子,但是假设无论向左还是向右,下一个路口都是唯一且相同的。起初由于没有任何先验知识,因此采用了一个随机策略,然后我们可以计算得到随机策略下的状态值函数,那么我们就可以进行策略改进了。具体的做法就是前面提到的单步搜索,向左时当前动作的回报为10,因此单步搜索的结果为10+\\(\\gamma\\upsilon\\),\\(\\upsilon\\)为下一个路口的值函数,而向右为100+\\(\\gamma\\upsilon\\),因此策略会更新为向右,而不再是随机了,显然策略被改进了。同时我们注意到,单步搜索计算的值正是动作值函数。 根据上面的例子,我们可以总结一下策略改进的方法:遍历所有的状态和所有可能的动作,采用贪婪算法进行策略的更新,即对所有\\(s\\in\\bf S\\), \\[ \\begin{split}\\pi^{'}(s)&=\\arg\\max_{\\mathbf{a}}q_{\\pi}(s,a)\\\\ &=\\arg\\max_{\\mathbf{a}}\\sum_{s^{'}}p(s^{'}\\mid s,a)\\left[r(s,a,s^{'})+\\gamma\\upsilon_{\\pi}(s^{'})\\right]\\end{split}\\tag{3.2} \\] 现在我们已经知道如何计算当前策略的状态值函数,也知道可以根据动作值函数来更新策略,那下面就来讲讲如何从零开始求解最优策略。 策略迭代 一旦策略\\(\\pi\\)通过策略改进得到一个更好的策略\\(\\pi^{'}\\),那么我们就可以通过策略估计算法,计算策略\\(\\pi^{'}\\)的状态值函数,并用公式(3.2)进行策略改进得到一个比策略\\(\\pi^{'}\\)更好的策略\\(\\pi^{''}\\)。如下图所示,经过无数次的策略估计和策略改进后,我们终将会收敛于最优策略\\(\\pi_{*}\\)。这种通过不断迭代地去改进策略的方法叫做策略迭代(policy iteration)。 下面为整个策略迭代的算法过程: 值迭代 策略迭代算法需要不断地进行策略估计和策略改进,每次策略估计和改进都需要遍历一次所有的状态和动作,因此算法的计算量非常大,效率非常低。同时可以看到策略迭代的依据是贝尔曼公式,而如果直接利用贝尔曼最优公式会不会加速求解过程呢?事实上是可以的,下面的值迭代(value iteration)算法就是利用贝尔曼最优公式来提高求解效率的一种算法。 我们还是需要先迭代估计状态值函数,但不必每次迭代都进行策略改进。根据贝尔曼最优公式,可以直接用上一次迭代的最大动作值函数对当前迭代的状态值函数进行更新,如下所示: \\[ \\begin{split}\\upsilon_{k+1}(s)&=\\max_{\\mathbf{a}} q_{k}(s,a) \\\\ &=\\max_{\\mathbf{a}}\\sum_{s^{'}}p(s^{'}\\mid{s,a})\\left[r(s,a,s^{'})+\\gamma\\upsilon_{k}(s^{'})\\right] \\end{split}\\tag{3.3} \\] 值迭代算法的好处就是省去了每次迭代时的策略改进过程,并且由于每次迭代得到的\\(\\upsilon_{k+1}(s)\\)都要\\(\\geq\\)策略迭代得到的\\(\\upsilon_{k+1}(s)\\),也就是说相同迭代次数下,策略迭代得到的策略肯定没有值迭代得到的策略好,因此能大大加快算法收敛。直到值函数收敛到最优值函数后,再通过最优值函数来计算得到最优策略,下面是值迭代算法的完整过程: 一般来说值迭代和策略迭代都需要经过无数次迭代才能精确收敛到最优策略, 
而实践中我们往往会设定一个阈值\\(\\Delta\\)来作为迭代中止条件,即当所有的\\(\\upsilon_{\\pi}(s)\\)变化量小于\\(\\Delta\\)时,我们就近似的认为获得了最优策略。值迭代和策略迭代都可以用来求解最优策略,但是都需要依赖一个现有的环境模型,而对环境进行精确建模往往是非常困难的,所以导致了动态规划方法在MDP求解时几乎不可用,当然如果状态转移是确定性的(\\(p(s^{'}\\mid s,a)=1\\)),那就另当别论了。 蒙特卡罗方法 下面我们要讲的是蒙特卡罗方法(Monte Carlo Methods)。与动态规划不同,蒙特卡罗方法不需要知道环境的完整模型,仅仅需要经验就可以获得最优策略,这些经验可以通过与环境在线或模拟交互的方式获得。在线交互显然是不需要任何环境的先验知识,模拟交互虽然需要知道环境状态的转移,但与动态规划不同的是这里不需要知道具体的转移概率。 蒙特卡罗方法也称统计模拟方法,基本思想是通过对大量的重复随机事件进行统计,估计随机事件的概率分布或期望。一个典型的例子是利用蒙特卡罗方法计算圆周率。假设我们知道圆的面积公式为\\(S=\\pi r^{2}\\),那计算圆周率的公式自然就是\\(\\pi = \\frac{S}{r^{2}}\\),因此如果我们知道圆面积和圆半径,那么就可以求到圆周率。那么如何计算一个圆的面积呢?给定一个圆,我们可以画出这个圆的外切正方形,那么这个外切正方形的面积为\\(S_{正方形}=4r^{2}\\),现在我们往正方形区域随机投点,并统计点落在圆内的概率\\(p\\),那么圆面积可以这么计算:\\(S_{圆}=p\\cdot S_{正方形}\\),因此\\(\\pi=4\\cdot p\\)。可以想到,如果投点次数越多,\\(p\\)估计越精确,\\(\\pi\\)的结果也就越接近真实值。 蒙特卡罗策略估计 我们现在来考虑一下如何利用蒙特卡罗方法估计给定策略下的状态值函数。与上面计算圆周率的例子稍有不同的是,现在我们估计的是未来回报的期望,而不是概率,但基本思想是一样的。很显然,如果要估计\\(\\upsilon_{\\pi}(s)\\),我们首先需要根据给定策略生成大量的经验数据,然后从中统计从状态\\(s\\)开始的未来回报的平均值,这个平均值就是我们估计的状态值函数。这种利用蒙特卡罗方法进行策略估计的算法又叫做蒙特卡罗策略估计(Monte Carlo Policy Evaluation)。 蒙特卡罗策略估计在具体实现时又分为first-visit MC methods和every-visit MC methods。由于在一个episode中,状态\\(s\\)可能会出现多次,first-visit MC methods就是只统计第一次到达该状态的未来回报,而every-visit MC methods是所有达到该状态的未来回报都会统计累加起来。下面我们举例说明first-visit MC methods的估计方法[6]。 现在我们假设有如下一些样本(下图每一行都是在当前策略下的一个独立的episode),紫色实心点为状态\\(s\\),取折扣因子γ=1,即直接计算累积回报。 第一个episode中到达过两次状态\\(s\\),我们只计算第一次的未来回报\\(R_{1}(s)=1-2+0+1-3+5=2\\)。假设我们已经用相同的方法计算得到\\(R_{2}(s)=1\\),\\(R_{3}(s)=-5\\),\\(R_{4}(s)=4\\)。那么当前策略下状态\\(s\\)的值函数 \\[\\upsilon_{\\pi}(s)={\\bf E}\\left[R(s)\\right]=\\frac{1}{N}\\sum_{i=1}^{N}\\left[R_{i}(s)\\right]=\\frac{1}{4}\\left(2+1-5+4\\right)=0.5\\] 同样,如果生成的episode数量越多,\\(\\upsilon_{\\pi}(s)\\)的估计就越接近真实值,下面是具体的算法流程: 注意这里使用大写的\\(V\\)表示状态值函数的估计,Sutton的理由是状态值函数一旦初始化,就会立即变成一个随机的值了,因为\\(G\\)会随着生成的episode不同而不断变化。可以认为每次\\(G\\)都为\\(\\upsilon_{\\pi}(s)\\)的一个独立同分布估计,当数据量非常大时\\(V(s)\\)将最终收敛于这个分布的均值。 动作值函数的蒙特卡罗估计 由于我们没有完整的环境状态转移模型,因此即使我们得到当前策略的值函数,根据公式(3.2)也无法进行策略改进。既然我们可以估计得到状态值函数,那么肯定也可以用相同的方法直接估计动作值函数,在这里叫做动作值函数的蒙特卡罗估计(Monte Carlo Estimation of Action Values)。 估计方法跟蒙特卡罗策略估计差不多,只不过我们需要找到所有的状态动作对(pair of state \\(s\\) and action \\(a\\)),然后统计每一个状态动作对的未来回报的平均值,即\\(q_{\\pi}(s,a)\\)的估计值。得到了\\(q_{\\pi}(s,a)\\),我们就可以根据公式(3.2)进行策略改进了。 蒙特卡罗控制 蒙特卡罗控制(Monte Carlo Control)首要的问题就是如何估计最优策略。跟之前动态规划一样,这里也可以采用策略迭代和策略改进交替进行的方式,经过大量的迭代后收敛到最优策略。但蒙特卡罗方法有一个最大的问题,即我们需要产生无数的episode才能保证收敛到最优结果。无数的episode和大量的迭代导致计算量巨大,效率非常低。Sutton在书[1]中提到两种解决方法,其中一种方法是采用episode-by-episode的方式进行优化。 episode-by-episode的思想与动态规划中值迭代的in-place版本非常相似。在动态规划的值迭代中,我们每次迭代都直接覆盖更新值函数,因此能及时地利用到更新后的值函数,从而能加快收敛。episode-by-episode则是先用当前策略生成一个episode,然后根据这个episode进行动作值函数的更新,同时更新策略,并利用更新后的策略继续生成后续的episode。 下面是exploring starts的蒙特卡罗控制(Monte Carlo ES,exploring starts指的是从一个随机的开始状态和动作生成一个episode)算法的完整过程: 至于为何要使用exploring starts,这与episode-by-episode在线生成episode的更新策略有关。还是上面的岔路口的例子,我们先随机指定一个策略,比如指定向左,那么使用该策略生成一个episode时必然也是向左,那么也就只能更新向左的动作值函数了,而无法更新向右的动作值函数。由于动作值函数是随机初始化的,如果向右的动作值函数初始值小于更新后的向左的动作值函数,那么下一次生成episode时仍然是向左,并且可以想象可能永远不会选择向右。但其实向右才是最优动作,因此上述更新的策略永远不可能是最优策略。但随机选择开始状态和动作,可以避免某些动作的值函数不会更新的问题,因此可以保证能获得最优策略。 当然也可以采用其他方法避免使用exploring starts,下面要介绍的on-policy方法和off-policy方法就是其中的两种方法。 On-Policy蒙特卡罗控制 前面的Monte Carlo ES算法使用exploring starts是为了保证所有可能的动作值函数都能得到更新,从而保证能获得最优策略。如果策略本身就可以在任何状态下都采取所有可能的动作,而不是贪婪地只选择动作值函数最大的那个,那问题不就迎刃而解了吗。下面要讨论策略是非确定性的,也就是对于所有的状态\\(s\\)和该状态下所有可能的动作\\(a\\)都有\\(\\pi(a\\mid s)>0\\),并且用\\(\\epsilon-soft\\)策略生成episode。由于我们评估和改进的策略与生成episode的策略是相同的,因此叫做on-policy方法。 
在\\(\\epsilon-soft\\)策略中,大多数时候策略会选择动作值函数最大的动作(或者换句话说,以\\(1-\\epsilon\\)的概率选择动作值函数最大的动作,\\(\\epsilon\\)是一个非常小的正数),但也会以概率\\(\\epsilon\\)从其他动作中随机挑选一个动作,整体算法流程: Off-Policy蒙特卡罗控制 在off-policy方法中,生成episode的策略与评估和改进的策略并非同一个策略。其中生成episode的策略我们叫行为策略(behavior policy),而评估和改进的策略叫估计策略(estimation policy)。这种方法的好处是可以使行为策略是\\(\\epsilon-soft\\)策略,但估计策略是确定性的。下面只给出算法流程,具体推导请参考Sutton在书中的介绍[1]。 时间差分学习 时间差分学习(temporal-dierence (TD) learning)结合了动态规划和蒙特卡罗方法的优点,与蒙特卡罗方法一样不需要环境模型,与动态规划一样更新估计值时只依赖于下一个状态可用的估计值,而不需要等到策略自举出完整的episode。 TD预测 TD预测(TD prediction)又叫TD策略估计,就是从给定的一系列经验数据中估计出当前策略的状态值函数\\(\\upsilon_{\\pi}\\)。回顾一下蒙特卡罗控制,我们是先自举一个episode,然后根据历史episode和当前最新的episode计算从状态\\(s\\)开始未来回报的均值,作为当前状态值函数的更新值。对上面更新方式稍做修改,我们可以用一种滑动平均的方法来更新,即只用当前episode的未来回报与状态值函数的差值来更新。一个简单的every-visit MC方法的更新公式就如下所示: \\[V(S_{t})=(1-\\alpha)V(S_{t})+\\alpha G_{t}=V(S_{t})+\\alpha\\left[G_{t}-V(S_{t}) \\right]\\tag{4-1}\\] \\(V(S_{t})\\)表示第\\(t\\)个时刻为状态\\(S_{t}\\)的状态值函数,\\(G_{t}\\)表示从状态\\(S_{t}\\)开始到episode结束时的总回报,\\(\\alpha\\)是一个常数步长参数(梯度下降算法中叫学习率),这个公式叫做\\(constant-\\alpha\\) MC。在这个公式中,\\(G_{t}\\)是需要等到整个episode结束才能得到的,因此只有在自举完整的episode后才能进行更新。下面要说的TD算法就很好地解决了这个问题,只需要等到下一个时刻转移到下一个状态和获得回报值。下面是一种最简单的TD算法,叫做TD(0)。 \\[V(S_{t})=V(S_{t})+\\alpha\\left[R_{t+1}+\\gamma V(S_{t+1})-V(S_{t}) \\right]\\tag{4-2}\\] 我们这里只是用\\(R_{t+1}+\\gamma V(S_{t+1})\\)来估计\\(constant-\\alpha\\) MC中未来回报的真实值。与蒙特卡罗控制一样,TD(0)也能确保收敛到最优状态值函数,当然前提也是需要大量的经验数据。至于TD(0)与蒙特卡罗控制哪个算法收敛更快,这个问题并没有准确的答案,不过Sutton在书中指出,在一些随机任务上TD(0)比\\(constant-\\alpha\\) MC收敛更快。TD(0)算法在每个时刻都要进行一次更新,更高效的方法是在训练时使用batch updating的方式,即一个batch进行一次更新。 显然,TD learning相比MC有以下优点[7]: 由于TD预测使用差值进行更新,加上步进参数\\(\\alpha\\)的存在,TD learning的更新更平稳,方差更小。 TD learning可以用于在线训练,因为不需要等到整个episode结束才更新。 TD learning应用更广,可以用于非有限步数的情况。 但也存在一些缺点,比如TD learning对初始值比较敏感,以及收敛结果是有偏的。 TD(λ) 在介绍TD(λ)之前,我们先介绍一下n-Step TD预测。前面介绍的TD(0)算法在当前状态的基础上往后执行一步就可以进行更新,并且在更新时使用了贝尔曼公式对当前状态的未来回报进行估计,那我们是不是也可以往后执行n步之后再更新,这样用贝尔曼公式估计的未来回报是不是会更加精确呢?实际上,当n等于整个episode的总步数时,n-Step TD预测就完全成了MC估计了。 对于1-step来说,未来回报的值等于第一个回报值加上下一个状态值函数折扣后的值,用公式表示: \\[G_{t}^{(1)}=R_{t+1}+\\gamma V(S_{t+1})\\] 2-step比1-step多执行一步,其未来回报值为: \\[G_{t}^{(2)}=R_{t+1}+\\gamma R_{t+2}+\\gamma^{2} V(S_{t+2})\\] 那么n-step的未来回报值为: \\[G_{t}^{(n)}=R_{t+1}+\\gamma R_{t+2}+\\gamma^{2} V(S_{t+2})+...+\\gamma^{n}V(S_{t+n})\\] 在公式(4-1)中我们用\\(G_{t}^{(n)}\\)替代\\(G_{t}\\),最后n-Step TD预测的更新公式为: \\[V(S_{t})=V(S_{t})+\\alpha\\left[G_{t}^{(n)}-V(S_{t}) \\right]\\tag{4-3}\\] n-Step TD预测一定程度上可以使得估计的值函数更准确,因此收敛效果会更好,但更新时需要等待的步数增加了。下图是使用n-Step TD方法在random walk任务上的RMS error对比。 n-Step TD只使用了从当前状态开始执行n步未来回报的估计值\\(G_{t}^{(n)}\\),其实为了充分利用中间每个step的信息,也可以使用不同的n对应的\\(G_{t}^{(n)}\\)的平均值。比如可以把2-step和4-step的均值作为\\(G_{t}\\)的估计值, \\[G_{t}^{avg}=\\frac{1}{2}G_{t}^{(2)}+\\frac{1}{2}G_{t}^{(4)}\\] TD(λ)也可以理解为一种特殊的n-step平均算法,每个n-step的权重为\\((1-\\lambda)\\lambda^{(n-1)}\\),所有权重和仍然为1,因此有: \\[G_{t}^{(\\lambda)}=(1-\\lambda)\\sum_{n=1}^{\\infty}\\lambda^{n-1}G_{t}^{(n)}\\tag{4-4}\\] 公式(4-4)表示的是没有终止状态的情况,对于最终存在终止状态的episode任务或截断任务[注1]来讲,为了保证所有权重的和为1,最后一个n-step的权重被设置为\\(\\lambda^{T-t-1}\\),其中\\(T\\)为episode总步数。 \\[G_{t}^{(\\lambda)}=(1-\\lambda)\\sum_{n=1}^{T-t-1}\\lambda^{n-1}G_{t}^{(n)}+\\lambda^{T-t-1}G_{t}\\tag{4-5}\\] 当\\(\\lambda=1\\)时,这时TD(λ)就相当于MC,而当\\(\\lambda=0\\)时,TD(λ)就退化成了TD(0)。 Sarsa 接下来我们考虑一下如何使用TD预测进行策略改进。首先我们知道可以使用TD预测来估计状态值函数,并且可以使用公式(3-2)进行策略改进。但问题来了,公式(3-2)中的\\(p(s^{'}\\mid s,a)\\)是未知参数,无法直接进行策略改进。回顾一下蒙特卡洛控制方法,TD也可以直接对动作值函数\\(q_{\\pi}\\)进行估计。与\\(\\upsilon_{\\pi}\\)的更新公式一样,下面是\\(q_{\\pi}\\)的更新公式, \\[Q(S_t,A_t)=Q(S_t,A_t)+\\alpha[R_{t+1}+\\gamma Q(S_{t+1},A_{t+1})-Q(S_t,A_t)]\\tag{4-3}\\] 
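为了更具体地看清这条更新规则,下面给出公式(4-3)单步更新的一个最小Python示意:Q 用嵌套字典保存,\\(A_{t+1}\\) 由同一个行为策略(如 \\(\\epsilon-greedy\\))在 \\(S_{t+1}\\) 上选出,终止状态的 \\(Q(S_{t+1},A_{t+1})\\) 记为0。其中 \\(\\alpha\\)、\\(\\gamma\\) 为假设的超参数,仅为示意:

```python
def sarsa_update(Q, s, a, r, s_next, a_next, terminal, alpha=0.1, gamma=1.0):
    # Q(S_t,A_t) += alpha * [R_{t+1} + gamma * Q(S_{t+1},A_{t+1}) - Q(S_t,A_t)]
    target = r if terminal else r + gamma * Q[s_next][a_next]
    Q[s][a] += alpha * (target - Q[s][a])
    return Q
```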
有了状态值函数,接下来就可以使用公式(3-2)进行策略改进了。在公式(4-3)中,每次非结束状态\\(S_t\\)转移到下一个状态时都进行一次值函数的更新,每次更新都只与\\((S_t,A_t,R_{t+1},S_{t+1},A_{t+1})\\)有关,因此叫做Sarsa算法。如果状态\\(S_{t+1}\\)为终止状态,则\\(Q(S_{t+1},A_{t+1})=0\\)。下面是Sarsa \\(\\epsilon-greedy\\)算法的完整过程,由于评估和改进时采用的策略与生成episode的策略是同一个策略,因此Sarsa算法是一种on-policy方法。 Sarsa的\\(Q\\)值更新公式与\\(TD(0)\\)一致,实际上也可以采用\\(TD(λ)\\)的形式进行\\(Q\\)值更新,这个改进算法就是Sarsa(λ)。关于Sarsa(λ)的具体介绍请参考《Reinforcement Learning: An Introduction》一书第七章。 Q-Learning 下面介绍的Q学习是一种off-policy方法,并被认为是强化学习算法最重要的突破之一。在Q-learning中,动作值函数的更新完全独立于生成episode的策略,使得学习到的\\(Q(S_t,A_t)\\)直接是最优动作值函数\\(q_{*}\\)的估计值。 \\[Q(S_t,A_t)=Q(S_t,A_t)+\\alpha[R_{t+1}+\\gamma \\mathop \\max_{a} Q(S_{t+1},a)-Q(S_t,A_t)]\\tag{4-4}\\] 公式(4-4)为Q-learning的单步更新公式,与Sarsa唯一的不同是:类似于动态规划中的值迭代算法,Q学习也是直接使用最优的\\(Q(S_{t+1}, A_{t+1})\\)进行更新,也就相当于策略只采用了最大\\(Q\\)值对应的动作。 Q-learning简化了算法分析和收敛性证明的难度,使得它的收敛性很早就得到了证明。但与前面介绍的蒙特卡洛控制一样,由于每次只选择\\(Q\\)值最大的动作,因此这个算法也会导致部分state-action对不会被策略生成,相应的动作值函数也无法得到更新。为了确保能收敛到最优策略,下面的算法在生成episode时同样使用了\\(\\epsilon-greedy\\)策略,但更新时仍然采用确定性策略(即策略只选择\\(Q\\)值最大的动作)。 DQN DQN改进算法 强化学习在内容推荐中的应用 参考资料 1、Reinforcement Learning: An Introduction, Richard S. Sutton and Andrew G. Barto,2012 2、Playing Atari with Deep Reinforcement Learning,DeepMind Technologies,Arxiv 2013.12 3、Human-level control through deep reinforcement learning,DeepMind Technologies,Nature 2015.02 4、DeepMind官网 https://deepmind.com/blog/deep-reinforcement-learning 5、https://www.nervanasys.com/demystifying-deep-reinforcement-learning 6、http://www.cnblogs.com/jinxulin/p/3511298.html 7、Introduction to Reinforcement Learning,David Silver 注释 1、截断任务:在强化学习中,非episode任务由于不存在终止状态,为了便于训练可以将非episode任务截断成episode。","categories":[{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/categories/reinforcement-learning/"}],"tags":[{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/tags/reinforcement-learning/"},{"name":"machine learning","slug":"machine-learning","permalink":"https://hjchen2.github.io/tags/machine-learning/"}]},{"title":"Mac上搭建基于Github的Hexo博客 — Testing","slug":"Mac上搭建基于Github的Hexo博客","date":"2017-03-22T13:31:08.000Z","updated":"2023-01-03T14:04:56.974Z","comments":true,"path":"2017/03/22/Mac上搭建基于Github的Hexo博客/","link":"","permalink":"https://hjchen2.github.io/2017/03/22/Mac%E4%B8%8A%E6%90%AD%E5%BB%BA%E5%9F%BA%E4%BA%8EGithub%E7%9A%84Hexo%E5%8D%9A%E5%AE%A2/","excerpt":"博客搭建 搭建过程请参考原文链接。 注意在mac上安装hexo时选择安装hexo-cli,否则可能会出现以下报错: [Error: Cannot find module './DTraceProviderBindings']","text":"博客搭建 搭建过程请参考原文链接。 注意在mac上安装hexo时选择安装hexo-cli,否则可能会出现以下报错: [Error: Cannot find module './DTraceProviderBindings'] 主题美化 简单说一下,下次有时间写个详细过程。主要做了以下修改: 使用NexT主题替换默认的landscape主题 简化了页脚,看起来更美观 修改了左边侧栏黑色背景,改成灰色 侧栏加入本地搜索功能 使用hypercomments评论插件,支持匿名评论","categories":[{"name":"Daily","slug":"Daily","permalink":"https://hjchen2.github.io/categories/Daily/"}],"tags":[{"name":"web technology","slug":"web-technology","permalink":"https://hjchen2.github.io/tags/web-technology/"}]},{"title":"caffe学习总结","slug":"caffe学习","date":"2017-01-19T04:31:08.000Z","updated":"2023-05-19T04:05:09.105Z","comments":true,"path":"2017/01/19/caffe学习/","link":"","permalink":"https://hjchen2.github.io/2017/01/19/caffe%E5%AD%A6%E4%B9%A0/","excerpt":"caffe学习总结 caffe的由来 caffe是贾扬清在UC Berkeley攻读计算机科学博士学位时开发的一套深度学习框架,由于高效、易读和模块化的设计,开源后经过nvidia的帮助优化和社区不断的完善,如今成为视觉领域主流的框架之一。","text":"caffe学习总结 caffe的由来 caffe是贾扬清在UC Berkeley攻读计算机科学博士学位时开发的一套深度学习框架,由于高效、易读和模块化的设计,开源后经过nvidia的帮助优化和社区不断的完善,如今成为视觉领域主流的框架之一。 贾扬清其人 清华大学的本硕,UC Berkeley的计算机科学博士,师承Prof. 
Trevor Darrell,期间在新加坡国立大学、微软亚洲研究院、NEC美国实验室和google研究院实习和工作。博士毕业后一直在google brain担任研究科学家,致力于机器视觉、深度学习和tensorflow相关工作。2016年2月加入facebook,主导facebook大多数AI应用的通用、大规模机器学习平台(目前以caffe2为基础的caffe2go已经开源)。 为什么要开发caffe 贾最早开发的是另一款软件Decaf,主要功能是基于cuda-convnet进行CNN训练。2013年贾扬清读博期间跟心理学老师合作研究使用概率框架来表达人的行为,\"但是因为图像上提取的特征比较弱,所以可以外推的结果比较有限\",而2012年Alex Krizhevsky提出的AlexNet在ImageNet比赛中大获成功,贾因此也希望将CNN应用到他们的心理学研究上,于是就开始写了Decaf,通过Decaf验证了\"深度学习特征的优异的可移植性\",因此就开始开发一套通用的深度学习框架,即后来的caffe。 caffe与其他一些主流框架的比较 caffe同期也存在其他一些开源框架,比如cuda-convnet、theano、torch等,并且后来又陆续开源了neon、mxnet、tensorflow、CNTK以及paddled等等。现在对于研究者,如何选择一个框架也成了一个麻烦的问题了。下图是2014年贾扬清在caffe论文中对当时的一些框架做的一个比较: 下面是近年主流框架的一个简单比较: 特性 主语言 从语言 硬件 分布式 命令式 声明式 自动梯度 caffe C++ Python/Matlab CPU/GPU ✖ ✖ ✔ ✖ mxnet C++ Python/R/Julia/Scala CPU/GPU/Mobile ✔ ✔ ✔ ✔ tensorflow C++ Python CPU/GPU/Mobile ✔ ✖ ✔ ✔ Torch Lua - CPU/GPU/FPGA ✔ ✔ ✖ ✔ theano Python - CPU/GPU ✖ ✖ ✔ ✔ 效率 caffe代码组织结构 caffe代码结构是非常清晰的,主要包含以下文件和目录: Makefile和Makefile.config caffe支持cmake和make两种编译方式,不过大部分人只需要用make编译就可以了。Makefile.config可以对一些编译选项进行配置,比如USE_MPI、CPU_ONLY、DEBUG等等。 include 在caffe中除了proto文件生成的头文件外,所有的c++头文件都放在include目录中。 src src与include的目录结构基本上相同,include目录中的文件基本上都能在src目录中找到对应的实现文件。 tools tools目录下是caffe提供给用户直接使用的接口,比如caffe.cpp用于模型训练、评估以及统计耗时,另外也提供一些数据集转换、计算均值等工具。 examples 提供一些训练相关的脚本和网络配置,比如数据预处理脚本、不同的网络配置文件以及训练脚本。 models 提供一些模型的网络配置文件,以及训练好的模型,用户可以直接用训练好的模型进行fine-tune或者分类。 matlab/python 提供matlab和python的接口。 caffe网络的组织方式 从LeNet开始,CNN就开始有了一个标准的分层结构——堆叠卷积层,卷积层可能后接一些normalization和pooling层,网络最后接一个或多个全连接层。由于梯度下降算法非常适合逐层计算,因此当时很多的通用框架都将网络(Net)抽象为多个数据处理层(Layer)组成的有向图,并支持灵活地定义网络结构。caffe将神经网络的训练问题分解为四个方面:数据、计算、流动控制以及问题求解,分别对应caffe中的Blob、Layer、Net和Solver。网络中流动的数据以及参数都用Blob来表示,Layer负责前向输出和后向梯度的计算,Net负责控制Layer计算的顺序,Solver是一个求解器的角色,根据Net的梯度对网络参数进行更新。 [待补充] caffe中的Blob及同步策略 Blob是caffe中存储数据的基本结构,可以简单理解为一个4维的数组,数据组织格式为(N,C,H,W)。在caffe中上下层流动的数据和每层的权重参数都是用Blob来保存的,为了便于使用,Blob具有一些特性: Blob的内存是懒分配的(lazily allocate),只有在真正使用的时候才会分配内存 Blob会在CPU和GPU上各自分配一块相同大小的内存,便于在CPU和GPU之间进行切换 用户不需要关心CPU和GPU数据的同步,Blob会根据需要自动同步 下面是Blob的成员变量,data_是Blob存储的数据,diff_保存的是数据的梯度,shape_data_和shape_保存的都是当前数组的形状,count_是当前数据的大小,capacity_是申请的内存的大小,避免每次Reshape都要释放并重新申请内存。 1234567// include/caffe/blob.hppshared_ptr<SyncedMemory> data_;shared_ptr<SyncedMemory> diff_;shared_ptr<SyncedMemory> shape_data_;vector<int> shape_;int count_;int capacity_; 下面主要说一下Blob的自动同步策略。首先看一下SyncedMemory的成员变量: 123456789// include/caffe/syncedmem.hppvoid* cpu_ptr_; // CPU内存数据void* gpu_ptr_; // GPU显存数据size_t size_; // 数据大小SyncedHead head_; // 同步标志bool own_cpu_data_; bool cpu_malloc_use_cuda_;bool own_gpu_data_;int gpu_device_; // GPU设备号 head_的取值范围为UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED。初始化时head_值为UNINITIALIZED,当调用Blob的取值函数时都会调用一次SyncedMemory的to_cpu或者to_gpu进行数据的同步,同步策略为: 1、取cpu数据时,会调用to_cpu函数,如果heda_为HEAD_AT_GPU,则需要将GPU的数据同步至CPU,否则不需要同步 2、取gpu数据时,会调用to_gpu函数,如果heda_为HEAD_AT_CPU,则需要将CPU的数据同步至GPU,否则不需要同步 head_标志的赋值: 1、每次调用SyncedMemory的mutable_cpu_data时,head_都会被置为HEAD_AT_CPU 2、每次调用SyncedMemory的mutable_gpu_data时,head_都会被置为HEAD_AT_GPU 3、每次同步之后heda_会被置为SYNCED。 因此Blob通过判断每次修改的位置来自行决定是否需要对不同设备间的两份数据进行同步,使用时就像只有一份数据一样,非常方便。 caffe中的Layer layer是caffe模型的主要组成部分和基本的计算单元,与很多框架中的operator对应,一个典型的layer在forward时从下层连接获取输入,经过计算后输出到上层,backward时又从上层连接获取误差,计算本层梯度和误差后,将误差传递到下层连接。因此基类Layer实现了三个基本函数setup、forward和backward。 setup:根据下层连接和配置参数完成本层参数的初始化,以及输出blobs的初始化 forward:前向计算过程,并计算本层的loss backward:后向计算过程,并将本层误差传递到下层 
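为了说明这三个函数的分工,这里给出一个与具体框架无关的极简示意(类名和函数名都是虚构的,并不是caffe的真实接口,caffe真正的基类和虚函数请以include/caffe/layer.hpp为准),以一个ReLU风格的无参数层为例:

// 与框架无关的示意:用一个基类表达 setup / forward / backward 的职责划分
#include <algorithm>
#include <vector>

struct SimpleLayer {
  virtual ~SimpleLayer() {}
  // setup:根据输入完成本层的初始化(无参数层基本不需要做什么)
  virtual void Setup(const std::vector<float>& bottom) = 0;
  // forward:由输入 bottom 计算输出 top
  virtual void Forward(const std::vector<float>& bottom,
                       std::vector<float>* top) = 0;
  // backward:由上层传来的误差 top_diff 计算传给下层的误差 bottom_diff
  virtual void Backward(const std::vector<float>& top_diff,
                        const std::vector<float>& bottom,
                        std::vector<float>* bottom_diff) = 0;
};

struct ReluLikeLayer : public SimpleLayer {
  void Setup(const std::vector<float>& bottom) override { (void)bottom; }
  void Forward(const std::vector<float>& bottom,
               std::vector<float>* top) override {
    top->resize(bottom.size());
    for (size_t i = 0; i < bottom.size(); ++i) (*top)[i] = std::max(0.0f, bottom[i]);
  }
  void Backward(const std::vector<float>& top_diff,
                const std::vector<float>& bottom,
                std::vector<float>* bottom_diff) override {
    bottom_diff->resize(bottom.size());
    for (size_t i = 0; i < bottom.size(); ++i)
      (*bottom_diff)[i] = bottom[i] > 0.0f ? top_diff[i] : 0.0f;
  }
};

真实的caffe Layer还需要维护本层的参数blobs_、loss权重等信息,这里全部略去,只为体现三个函数各自的职责。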
forward和backward里面都会对CPU和GPU进行分支,如果是CPU模式,则真正参与计算的是forward_cpu和backward_cpu,如果是GPU模式,则参与计算的是forward_gpu和backward_gpu,并且在基类中forward_gpu和backward_gpu分别调用的是forward_cpu和backward_cpu,当然用户在定义新的layer时可以自行实现forward_gpu和backward_gpu。 基类Layer的成员变量: 123456789101112// include/caffe/layer.hpp/** The protobuf that stores the layer parameters */LayerParameter layer_param_;/** The phase: TRAIN or TEST */Phase phase_;/** The vector that stores the learnable parameters as a set of blobs. */vector<shared_ptr<Blob<Dtype> > > blobs_;/** Vector indicating whether to compute the diff of each param blob. */vector<bool> param_propagate_down_;/** The vector that indicates whether each top blob has a non-zero weight in * the objective function. */vector<Dtype> loss_; layer_param_是从protobuf文件中反序列化得到的,存放的是layer的配置参数 phase_指示是训练还是测试 blobs_是本层的参数,比如权重和偏置 param_propagate_down_为每一个参数设定是否需要计算梯度 loss_是本层的损失值,loss层每个输出blob都有一个损失值,非loss层损失为0 由基类Layer直接或间接派生出各种layer,比如卷积(convolution)、全连接(fully connected或者inner product)、dropout、pooling、relu、softmaxWithLoss等等,每一个派生layer都会强制实现forward_cpu和backward_cpu。早期的caffe将layer分成5类, dataLayer类: 各类数据读取的接口 neuronLayer类: 各种激活函数、dropout visionLayer类: 卷积层、采样层等2D图像相关的运算 commonLayer类:全连接层和其他运算 lossLayer类:实现各种代价函数 不过目前最新版本的caffe已经取消了visionLayer和commonLayer的分类。此外由于caffe使用了cuDNN运算加速库,因此部分layer有caffe和cuDNN两种实现,使用时可以通过protobuf文件配置需要使用的engine。 为了保持框架的可扩展性,大多数框架在layer或者operator的实现中使用了工厂模式,使用统一的工厂类来对不同的layer或operator进行实例化。下面是caffe使用工厂模式的代码实现, 123456789101112131415161718192021222324252627282930313233// include/caffe/layer_factory.hpptemplate <typename Dtype>class LayerRegistry {public: typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&); typedef std::map<string, Creator> CreatorRegistry; static CreatorRegistry& Registry() { static CreatorRegistry* g_registry_ = new CreatorRegistry(); return *g_registry_; } // Adds a creator. 
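  // 说明:下面的 AddCreator 由 REGISTER_LAYER_CREATOR 宏间接调用——该宏会为
  // float/double 两种 Dtype 各定义一个静态的 LayerRegisterer 对象,其构造函数在
  // 程序启动时把层类型字符串(如 "Sigmoid")与对应的 creator 函数注册到这里的
  // CreatorRegistry 中;之后 Net 构建网络时即可按 LayerParameter 的 type 字段
  // 通过注册表实例化相应的 Layer。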
static void AddCreator(const string& type, Creator creator) { CreatorRegistry& registry = Registry(); CHECK_EQ(registry.count(type), 0) << "Layer type " << type << " already registered."; registry[type] = creator; }...};template <typename Dtype>class LayerRegisterer { public: LayerRegisterer(const string& type, shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) { // LOG(INFO) << "Registering layer type: " << type; LayerRegistry<Dtype>::AddCreator(type, creator); }};#define REGISTER_LAYER_CREATOR(type, creator) \\ static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>); \\ static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>) \\ 12345678910111213141516171819202122// src/caffe/layer_factory.cpptemplate <typename Dtype>shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) { SigmoidParameter_Engine engine = param.sigmoid_param().engine(); if (engine == SigmoidParameter_Engine_DEFAULT) { engine = SigmoidParameter_Engine_CAFFE;#ifdef USE_CUDNN engine = SigmoidParameter_Engine_CUDNN;#endif } if (engine == SigmoidParameter_Engine_CAFFE) { return shared_ptr<Layer<Dtype> >(new SigmoidLayer<Dtype>(param));#ifdef USE_CUDNN } else if (engine == SigmoidParameter_Engine_CUDNN) { return shared_ptr<Layer<Dtype> >(new CuDNNSigmoidLayer<Dtype>(param));#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; }}REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer); caffe中的Net Net是由Layer组成的有向图,表示整个神经网络的拓扑结构,与很多框架中的graph对应,一般用一个protobuf文件来定义。而且Layer作为有向图中的一个组件,是无法感知自己的上层和下层连接的,需要Net将数据feed给Layer,这样数据在有向图中才能真正流动起来。因此Net至少需要提供构建一个有向图和feed数据流两种功能。 构建一个有向图:void Init(const NetParameter& in_param) feed数据流: const vector<Blob<Dtype>*>& Forward(Dtype* loss)和void Backward() 在构建有向图时,caffe首先会对不符合规则的layer进行过滤,比如对于test net,则会把只用于train的layer过滤掉。对于有向图中可能存在分支的情况,caffe会自动插入split层,将原输入blob复制多份,分别输入不同的分支,比如:LeNet网络中的数据层的label需要输入到accuracy层和loss层,那么需要在数据层再插入一层,如下图所示。 Net会根据网络结构逐层创建layer,并指定输入输出blobs,以及是否需要backward。 1234567891011121314151617181920212223242526// src/caffe/net.cpp:Init...for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { ... layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param)); ... for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); ++bottom_id) { const int blob_id = AppendBottom(param, layer_id, bottom_id, &available_blobs, &blob_name_to_idx); // If a blob needs backward, this layer should provide it. need_backward |= blob_need_backward_[blob_id]; } int num_top = layer_param.top_size(); for (int top_id = 0; top_id < num_top; ++top_id) { AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx); // Collect Input layer tops as Net inputs. if (layer_param.type() == "Input") { const int blob_id = blobs_.size() - 1; net_input_blob_indices_.push_back(blob_id); net_input_blobs_.push_back(blobs_[blob_id].get()); } } ... 
layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); ...} 在训练时,train net会首先初始化,test net之后初始化,每次test时会调用ShareTrainedLayersWith共享train net的参数,这样做可以节省显存并且避免不必要的数据拷贝。 需要注意的是,在protobuf文件中声明网络结构时,必须依照从下到上的顺序一层一层定义网络参数,而且test net和train net对应层的name最好一致(虽然不一致可能不会导致程序报错),因为test net与train net是根据匹配name进行参数共享的,如果name不一致则会导致无法进行参数共享,增加显存消耗的同时还会导致test结果不正确。 当有向图构建完成后,我们只需要调用Forward和Backward,数据就能流经整个网络,得到每层的输出、loss和每个参数的梯度。 123456789101112131415161718192021222324252627// src/caffe/net.cpptemplate <typename Dtype>Dtype Net<Dtype>::ForwardFromTo(int start, int end) { CHECK_GE(start, 0); CHECK_LT(end, layers_.size()); Dtype loss = 0; for (int i = start; i <= end; ++i) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } } return loss;}template <typename Dtype>void Net<Dtype>::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); for (int i = start; i >= end; --i) { if (layer_need_backward_[i]) { layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); if (debug_info_) { BackwardDebugInfo(i); } } } } caffe中的Solver 前面讲到Net通过调用Forward和Backward可以得到每个参数的梯度,而Solver的主要作用就是根据这些梯度进行网络参数的更新。由于caffe将Net作为Solver的底层实现,因此Solver也就成了控制整个训练过程的中枢。Solver提供三个主要函数:Init、Solve、ApplyUpdate。 Init:创建训练网络和测试网络,初始化一些参数 12345678910111213// src/caffe/solver.cpptemplate <typename Dtype>void Solver<Dtype>::Init(const SolverParameter& param) { ... // Scaffolding code InitTrainNet(); if (Caffe::root_solver()) { InitTestNets(); LOG(INFO) << "Solver scaffolding done."; } iter_ = 0; current_step_ = 0;} Solve:调用Step进行迭代训练,每次迭代后都会调用ApplyUpdate进行参数的更新 123456789101112131415161718192021222324252627282930313233343536373839404142434445// src/caffe/solver.cpptemplate <typename Dtype>Dtype Solver<Dtype>::ForwardBackward() { ... // accumulate the loss and gradient for (int i = 0; i < param_.iter_size(); ++i) { loss += net_->ForwardBackward();\\ } return loss / param_.iter_size();} template <typename Dtype>void Solver<Dtype>::Step(int iters) { ... while (iter_ < stop_iter) { if (param_.test_interval() && iter_ % param_.test_interval() == 0 && (iter_ > 0 || param_.test_initialization()) && Caffe::root_solver()) { TestAll(); // 进行测试 } ... for (int i = 0; i < callbacks_.size(); ++i) { callbacks_[i]->on_start(); } ... Dtype loss = ForwardBackward(); ... UpdateSmoothedLoss(loss, start_iter, average_loss); ... for (int i = 0; i < callbacks_.size(); ++i) { callbacks_[i]->on_gradients_ready(); } if (!param().disabled_update()) { ApplyUpdate(); } ++iter_; ...} template <typename Dtype>void Solver<Dtype>::Solve(const char* resume_file) { ... Step(param_.max_iter() - iter_); ...} ApplyUpdate:调用对应的solver进行参数更新,下面是sgd solver的ApplyUpdate函数 12345678910111213// src/caffe/solvers/sgd_solver.cpptemplate <typename Dtype>void SGDSolver<Dtype>::ApplyUpdate() { ... Dtype rate = GetLearningRate(); //获取当前迭代的学习率 ... 
ClipGradients(); // 进行梯度规整 // learnable_params存放的是网络中所有需要学习的参数blobs for (int param_id = 0; param_id < this->net_->learnable_params().size(); ++param_id) { ApplyUpdate(param_id); // 逐个更新参数 }} 由于梯度下降算法发展出了非常多的优化算法,目前caffe提供了六种优化算法来求解最优参数,在solver配置文件中,通过设置type类型来选择。 Stochastic Gradient Descent (type: \"SGD\"), AdaDelta (type: \"AdaDelta\"), Adaptive Gradient (type: \"AdaGrad\"), Adam (type: \"Adam\"), Nesterov’s Accelerated Gradient (type: \"Nesterov\"), RMSprop (type: \"RMSProp\") caffe断点保存和恢复 由于训练过程往往非常耗时,为了能够在突发情况后快速恢复训练,caffe提供了断点保存和恢复的功能,在solver的配置文件中可以配置保存的频率及保存时文件名的前缀,一个比较完整的solver配置文件如下: 12345678910111213141516// solver.prototxtnet: "./train_val.prototxt" // 定义net的protobuf文件test_iter: 100 // 测试的迭代次数,这个需要根据测试数据的大小和测试时的batch size计算得到,test_iter = test_dataset_size / test_batch_sizetest_interval: 1000 // 设置test的频率,每训练1000次迭代就测试一次base_lr: 0.01 // 设置学习率lr_policy: "step" // 设置学习率衰减策略gamma: 0.1 // step衰减因子,stepsize: 10000 // 衰减的频率,每训练10000次迭代衰减一次,衰减后的学习率=当前学习率*gammadisplay: 500 // 训练log打印频率max_iter: 45000 // 设置最大训练多少次迭代type: "SGD" // 设置solver类型 momentum: 0.9 // 设置SGD中的动量项weight_decay: 0.0005 // 设置正则系数snapshot: 1000 // 设置模型保存频率snapshot_prefix: "../output/caffe_alexnet_train" // 设置模型保存时文件名前缀solver_mode: CPU // 设置训练模式,CPU还是GPU 当然还有一些其他的参数,比如正则化类型和模型保存文件格式等,都会使用在proto文件中定义的默认值,具体查看src/caffe/proto/caffe.proto文件中的SolverParameter。 为了实现断点保存和恢复,caffe在Solver中加入了Snapshot和Restore,分别进行模型保存和模型恢复,相应地,在Net中也加入了ToProto/ToHDF5和CopyTrainedLayersFromBinaryProto/CopyTrainedLayersFromHDF5。Solver调用Step进行训练的时候,每次参数更新结束都会判断是否需要保存模型。 1234567// src/caffe/solver.cpp:Stepif ((param_.snapshot() && iter_ % param_.snapshot() == 0 && Caffe::root_solver()) || (request == SolverAction::SNAPSHOT)) { Snapshot();} Solver中Snapshot对模型参数和训练状态进行保存,模型参数提供两种保存格式——binary protobuf和hdf5。如果是protobuf格式,则会调用Net的ToProto,否则调用ToHDF5。 123456789101112// src/caffe/net.cpptemplate <typename Dtype>void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const { param->Clear(); param->set_name(name_); // Add bottom and top DLOG(INFO) << "Serializing " << layers_.size() << " layers"; for (int i = 0; i < layers_.size(); ++i) { LayerParameter* layer_param = param->add_layer(); layers_[i]->ToProto(layer_param, write_diff); }} Solver在开始训练时会尝试调用Restore进行断点恢复,根据文件名后缀判断文件格式,并选择RestoreSolverStateFromHDF5还是RestoreSolverStateFromBinaryProto。 12345// src/caffe/solver.cpp:Solveif (resume_file) { LOG(INFO) << "Restoring previous solver status from " << resume_file; Restore(resume_file);} in-place计算 为了节约显存,caffe支持原址计算,就是输入与输出都是同一个blob。如果前一层的输出和本层的输入都与后向计算时无关,而且本层的输入和输出blob大小相同,就可以使用in-place计算,比如卷积层后面的Sigmoid、Relu等都可以用同址计算,而BatchNorm层也支持in-place计算,是因为BatchNorm在实现时会将输入数据进行备份。使用同址计算只要在protobuf文件中指定该层的top和bottom是同名的就可以了,比如: 123456layer { bottom: "conv1" top: "conv1" name: "conv1_relu" type: "ReLU"} 参数初始化方法 由于神经网络的目标函数往往是非凸的,参数初始化会对最终的收敛结果造成非常大的影响。为了满足不同的参数初始化需求,caffe提供了多种初始化方法,并且在net的配置文件中可以为每个参数选择一个初始化方法。比如下面的weight_filler和bias_filler: 12345678910111213141516171819layer { bottom: "data" top: "conv1" name: "conv1" type: "Convolution" convolution_param { num_output: 64 kernel_size: 7 pad: 3 stride: 2 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0.2 } }} 在include/caffe/filler.hpp中caffe提供如下的初始化方法: constant:常量初始化,参数所有的值都被初始化为相同的值 uniform:均匀初始化,参数的值按照指定区间均匀分布随机初始化 gaussian:高斯初始化,参数的值按照指定均值和方差的正态分布随机初始化 positive unitball xavier:本质上也是一种指定区间均匀分布的随机初始化方式,只是区间是通过参数大小计算得到 msra:与xavier类似,不过使用的是指定均值和方差的正态分布随机初始化方式 bilinear 多卡并行策略 为了提高效率,caffe支持单机多GPU并行训练,目前采用的是数据并行方式,暂不支持模型并行,为此caffe增加了一个P2PSync类,下面主要介绍一下P2PSync如何实现多卡并行的。 
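在看P2PSync的具体实现之前,可以先用一个与caffe无关的小例子体会“二叉树式梯度归约”的思路:子节点把梯度上推给父节点,父节点累加后再继续上推,根节点最终得到全局的梯度和。下面的Node、ReduceGradients等名字都是示意用的假设,省略了真实实现中GPU间的数据拷贝与线程同步:

// 二叉树式梯度归约的示意(假设各节点的梯度向量长度一致)
#include <vector>

struct Node {
  std::vector<double> grad;      // 本节点(本块 GPU)上算得的梯度
  std::vector<Node*> children;   // 子节点,二叉树结构下最多两个
};

// 自底向上归约:调用结束后 node->grad 存放的是以 node 为根的子树的梯度之和
void ReduceGradients(Node* node) {
  for (Node* child : node->children) {
    ReduceGradients(child);
    for (size_t i = 0; i < node->grad.size(); ++i) node->grad[i] += child->grad[i];
  }
}

根节点拿到求和后的梯度再统一做参数更新,并把新参数沿树下发,这与下文对root solver和worker solver分工的描述是一致的。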
P2PSync封装了一个Solver负责训练,每张GPU都会对应一个P2PSync,并且P2PSync之间具有主从关系,它们之间构成一个二叉树的结构。在前向计算时,主P2PSync需要将模型分发给从P2PSync,而在后向传导时,从P2PSync就需要把梯度传给主P2PSync,主P2PSync会在聚合从P2PSync的梯度后传给更上一层的主P2PSync。在二叉树结构中,根节点P2PSync的Solver被叫做root solver,其他solver叫做worker solver,只有root solver才能进行参数更新,worker solver只是将梯度聚合并传递给root solver。 在P2PSync中主要的函数就InternalThreadEntry、on_start和on_gradients_ready。 1234567// src/caffe/parallel.cpptemplate<typename Dtype>void P2PSync<Dtype>::InternalThreadEntry() {... solver_->Step(solver_->param().max_iter() - initial_iter_); } InternalThreadEntry是一个线程函数,Solver调用Step进行训练,在Step中每次前向计算前都会回调on_start获取最新模型,而在后向计算结束后又会回调on_gradients_ready传递梯度。 123456789101112131415161718192021// src/caffe/solver.cpptemplate <typename Dtype>void Solver<Dtype>::Step(int iters) { ... while (iter_ < stop_iter) { for (int i = 0; i < callbacks_.size(); ++i) { callbacks_[i]->on_start(); // 回调P2PSync中的on_start,从主P2PSync获取新模型 } ... Dtype loss = Forward_backward(); ... for (int i = 0; i < callbacks_.size(); ++i) { callbacks_[i]->on_gradients_ready(); // 回调P2PSync中的on_gradients_ready,依次聚合从P2PSync和自身的梯度,并将梯度发送给主P2PSync } if (!param().disabled_update()) { ApplyUpdate(); // 这里只有root solver才会进行参数更新 } ... }} 1234567891011121314151617181920212223242526template<typename Dtype>void P2PSync<Dtype>::on_start() {#ifndef CPU_ONLY ... // Wait for update from parent if (parent_) { /* 除了root solver,其他worker solver都有对应的parent 程序执行到这里时会阻塞,当主P2PSync将自身入队后就会通知从P2PSync,pop就能返回 */ P2PSync<Dtype> *parent = queue_.pop(); // 等待主P2PSync入队 CHECK(parent == parent_); } // Update children for (int i = children_.size() - 1; i >= 0; i--) { Dtype* src = data_; Dtype* dst = children_[i]->data_; ... // 主P2PSync将模型直接拷贝给从P2PSync CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), cudaMemcpyDeviceToDevice, cudaStreamDefault)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); // 主P2PSync将自身入队,并通知从P2PSync children_[i]->queue_.push(this); } #endif } 123456789101112131415161718192021222324252627282930template<typename Dtype>void P2PSync<Dtype>::on_gradients_ready() {#ifndef CPU_ONLY ... // Sum children gradients as they appear in the queue for (int i = 0; i < children_.size(); ++i) { P2PSync<Dtype> *child = queue_.pop(); // 等待从P2PSync入队 // 由于parent_grads_是在主P2PSync设备上开辟的一块缓冲区,因此child->parent_grads_其实就是当前设备上的缓冲区 Dtype* src = child->parent_grads_; // 获取从P2PSync的梯度 Dtype* dst = diff_; // 合并从P2PSync的梯度 caffe_gpu_add(size_, src, dst, dst); } ... // Send gradients to parent if (parent_) { Dtype* src = diff_; Dtype* dst = parent_grads_; // 从P2PSync将梯度复制到主P2PSync的缓冲区 CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // cudaMemcpyDeviceToDevice, cudaStreamDefault)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); // 自身入队,通知主P2PSync parent_->queue_.push(this); } else { // Loss functions divide gradients by the batch size, so to compensate // for split batch, the root solver divides by number of solvers. 
caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_); }#endif intel caffe多机并行策略 单机多卡的训练方式已经足够解决目前大部分模型训练的需求了,但随着数据量越来越大、模型越来越复杂,分布式异构计算成为行业通行的解决方案。BVLC caffe是不支持分布式训练的,intel有两个部门将caffe进行了再次开发以支持分布式和最新的Intel MKL-DNN,分别为intel caffe和caffe multinode。目前BML API已经支持intel caffe的模型训练、评估和预测了。 intel caffe采用的是数据并行的方式,但不同于目前主流的centralized parameter server通信模型,intel caffe借鉴了单机多卡的策略,采用的是一种all-reduce的binary tree模型,也就是将节点按照二叉树组织起来,每个父节点负责1-2个子节点和自己父节点的通信,相比一个中心的PS需要同时与其他多个节点通信的方式,这种binary tree方式将一部分PS的计算平均到了每个节点上,而且相同level的父节点之间可以并行,增加了梯度合并的并行度。 [待图] 为了更好地掩盖通信开销,子节点不需要等到整个模型的梯度都计算完才发送,而是每个layer计算完梯度后就会立即发送给父节点,父节点收到所有子节点的梯度后将本层的梯度合并后也可以立即发送给上一层的父节点。每个layer的参数会按照buffer的大小分成多个part,每个part都会异步地进行发送,当进行下一次迭代时,除了根节点的所有节点都会被阻塞,等待根节点将最终的梯度进行合并,并更新模型后发送给子节点。 除了分层通信外,intel caffe也支持梯度量化压缩,可以将全精浮点数编码成指定字节数的数值,减少节点间通信量。 intel caffe为了支持多种协议的通信,使用了boost的asio::io_service接口,底层实现支持MPI、TCP和UDP,不过目前只实现了MPI接口。 训练时交叉验证是在单节点(准确来说是根节点)上进行的,但每个节点上都需要存在验证集文件,这是因为即使不进行test,其他节点也会初始化test网络。 实战 参考 贾扬清自述http://www.yangfenzi.com/keji/59535.html caffe官网http://caffe.berkeleyvision.org http://ucb-icsi-vision-group.github.io/caffe-paper/caffe.pdf https://www.zhihu.com/question/27982282 http://blog.csdn.net/myarrow/article/details/52064608","categories":[{"name":"ML framework","slug":"ML-framework","permalink":"https://hjchen2.github.io/categories/ML-framework/"}],"tags":[{"name":"caffe","slug":"caffe","permalink":"https://hjchen2.github.io/tags/caffe/"},{"name":"deep learning","slug":"deep-learning","permalink":"https://hjchen2.github.io/tags/deep-learning/"},{"name":"framework","slug":"framework","permalink":"https://hjchen2.github.io/tags/framework/"}]},{"title":"决策树在Kaldi中如何使用","slug":"Kaldi决策树如何使用","date":"2016-06-08T06:54:04.000Z","updated":"2023-02-07T02:40:38.882Z","comments":true,"path":"2016/06/08/Kaldi决策树如何使用/","link":"","permalink":"https://hjchen2.github.io/2016/06/08/Kaldi%E5%86%B3%E7%AD%96%E6%A0%91%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8/","excerpt":"说明:本文是kaldi主页相关内容的翻译(http://kaldi-asr.org/doc/tree_externals.html )。目前网上已经有一个翻译的版本,但翻译的不是很清楚,导致我在刚学这部分内容的时候产生了一些误解,所以我希望结合我目前所知道的一些东西,尽量把这部分内容翻译地比较容易理解,但由于也是初学者,一些错误也是不可避免,希望大家发现后一起交流,以便我后期修正。好了,还是废话少说吧。","text":"说明:本文是kaldi主页相关内容的翻译(http://kaldi-asr.org/doc/tree_externals.html )。目前网上已经有一个翻译的版本,但翻译的不是很清楚,导致我在刚学这部分内容的时候产生了一些误解,所以我希望结合我目前所知道的一些东西,尽量把这部分内容翻译地比较容易理解,但由于也是初学者,一些错误也是不可避免,希望大家发现后一起交流,以便我后期修正。好了,还是废话少说吧。 介绍(Introduction) 本页将对声学决策树在kaldi中如何被创建和使用,以及如何在训练和解码图构建过程进行运用给出一个概述性的解释。对于构建决策树代码的内部描述,请参见Decision tree internals;对于构建解码图方法的详细信息,可以参见Decoding graph construction in Kaldi。 实现的基本算法就是自顶向下的贪婪分裂,通过问一些问题,比如说左边的音素,右边的音素,中心音素以及当前的状态等等,我们会得到很多可以把数据进行分裂的路径。我们实现的算法与标准算法非常相似,请参见Young,Odell和Woodland的这篇论文\"Tree-based State Tying for High Accuracy Acoustic Modeling\" 。假设我们对数据建模时采用单高斯将它们分成两部分,在这个算法中,我们通过选择局部最优的问题进行数据分裂,也就是使得似然值增加最大的那个问题。与标准算法实现不同的地方包括可以自由配置树的根节点;对HMM状态和中心音素相关问题提问的能力;以及实际上在Kaldi脚本中默认情况下,问题集是通过对数据自顶向下的二分聚类自动生成的,这就意味着不需要手动去创建问题集。关于树的根节点的配置:可能是把一个共享的群组里面所有音素分裂的统计量,或者独立的音素,或者每个音素的HMM状态,作为树的根节点来进行分裂,或者把音素组作为树的根节点(注:多个音素作为一棵树的根节点)。对于如何用标准的脚本配置根节点,请参见Data preparation。实际上,我们一般让每棵树的根节点都对应一个真实的音素(real phone),意思就是说我们把每个音素的词位置相关、发音相关或者音调相关的所有变种都放进一个音素组,作为决策树的根节点。 本页下面主要给出相关代码层面的一些详细信息。 音素上下文窗(Phonetic context windows) 这里我们解释一下在代码中我们怎样描述一个音素的上下文。一棵特殊的决策树将有两个整型值,分别描述的是上下文窗的宽度和中心位置。下表简单说明了这两个值: N是上下文窗的宽度,P是设计的中心音素的标记。一般P就是窗的中心(因此叫中心位置);举例说,当N=3时我们一般设P=1,但是我们也可以从0到N-1自由选择;比如,P=2和N=3意味着有左上下文有两个音素,并且没有右上下文。在代码中,当我们讨论中心音素时,我们总是认为讨论的是第P个音素,可能是也可能不是上下文窗中心的那个音素。 一个用来表示典型的triphone上下文窗的整型向量可能是: 12//probably not valid C++vector<int32> ctx_window = { 12, 15, 21 }; 
假设N=3和P=1,这个表示有一个右上下文21和一个左上下文12的音素15。我们处理端点位置上下文的一个方式是使用0(0不是一个合法的音素,因为在OpenFst中0是为空符号epsilon而保留的),所以比如: 1vector<int32> ctx_window = { 12, 15, 0}; 表示有一个左上下文12和没有右上下文的音素15,因为音素15是句子的结尾。在句子结尾这种特殊的地方,0这种方式的使用可能有一点意外,因为最后一个“音素”实际上是后续符号“$”(参见Making the context transducer),但是为了在决策树代码中的便利,我们不把后续符号放进上下文窗,而是把0放进去。注意,如果此时我们N=3和P=2,那上述的上下文窗是非法的,因为第P个元素是一个不能表示任何真实音素的0;当然同样如果我们有一个N=1的树,上面的窗都是不合法的,因为那些窗的大小都是错误的。在单音素的情况下,我们可以有一个如下的窗: 1vector<int32> ctx_window = { 15 }; 因此单音素系统只是被当成上下文相关系统的一种特殊情况,窗的大小N等于1,并且还有一棵什么都不做的树(注:经过这棵树后没有任何参数被绑定)。 树的构建过程(The tree building process) 在这部分我们给出Kaldi中树构建过程的一个概述。 即使是单音素系统也有一个决策树,但是是一个无用的树。参见返回这样一个无用树的函数MonophoneContextDependency() 和 MonophoneContextDependencyShared()。这两个函数被命令行程序gmm-init-mono调用;它主要的输入参数是HmmTopology对象,并且输出一棵树,这棵树通常会被以ContextDependency类型的对象写到一个叫做“tree”的文件中,以及模型文件(模型文件包含一个TransitionModel对象和一个AmDiagGmm对象)。如果程序gmm-init-mono接受一个叫-shared-phones的可选参数,它将会在指定的音素序列间共享pdfs(注:输出概率密度函数,比如高斯),否则它会使得所有的音素都是独立的。 从一个扁平的初始(注:除了sil,所有的单音素模型都是一样的)开始训练一个单音素系统后,我们拿单音素对齐的结果和使用函数AccumulateTreeStats()(被acc-tree-stats调用)来累积训练决策树的统计量。这个程序不限于读取单音素的对齐结果;它也能读取上下文相关的对齐结果,因此我们也可以基于triphone对齐结果来构建树。构建树的统计量以BuildTreeStatsType类型(参见Statistics for building the tree)被写到磁盘。函数AccumulateTreeStats()输入N和P的值,N和P就是上文解释过的上下文窗的大小和中心音素位置。命令行程序会默认地将N和P设为3和1,但是也可以使用–context-width和–central-position可选参数进行覆盖。程序acc-tree-stats输入一个上下文无关的音素列表(比如,silence),但是即使存在上下文无关的音素,这个也不是必需的;它只是减少统计量大小的一个机制。对于上下文无关的音素,程序将会累积一个没有定义keys的相关的统计量,keys是跟左右音素对应的(注:在代码中会把一个音素不同的上下文和pdf-class分别作为不同的key,然后累积每个key的统计量)(c.f. Event maps)。 当统计量被积累后,我们使用程序build-tree来构建树。这个程序输出一棵树。程序build-tree需要三样东西: 统计量(BuildTreeStatsType类型) 问题集配置(Questions类型) roots文件(参见下面) 统计量一般从程序acc-tree-stats得到;问题集配置类可以用程序compile-questions输出,compile-questions输入一个声学问题集的拓扑列表(在我们的脚本中,这些都是自动地从构建树的统计量通过程序cluster-phones得到)(注:cluster-phones输入构建树的统计量可以得到一个声学问题集)。roots文件指定了将要在决策树聚类过程中共享根节点的音素集,并且对每个音素集指出下面两个东西: “shared”或者“not-shared”指出是每个pdf-class(也就是一般情况下的HMM状态)都有不同的根节点,还是所有pdf-class共享一个根节点。如果是“shared”,对于所有的HMM状态(比如在正常的HMM拓扑下所有的三个状态)将只会有一个树根节点;如果是“not-shared”,将会有三个树根节点,每个pdf-class有一个。 “split”或者“not-split”指出对于根节点要不要根据问题进行决策树分裂(对于silence,我们一般不分裂)。如果该行指定“split”(正常情况),那么我们进行决策树分裂。如果指定“not-split”,那么就不会进行分裂,因此根节点就被无分裂地保留。 下面将对这个怎样使用方面做一些阐述: 如果我们指定“shared split”,即使所有的三个HMM状态有一个根节点,不同的HMM状态仍然可以到达不同的叶子节点,因为树可以像对声学上下文的问题提问一样对pdf-class的问题提问。 对于roots文件中同一行出现的所有音素,我们总是让它们共享根节点。如果你不想共享音素的根节点,你只要把它们放在不同的行。 下面是roots文件的一个例子;假设音素1是silence,并且其他的音素都有不同的根节点。 1234not-shared not-split 1shared split 2...shared split 28 当我们有比如位置和声调相关的音素时,将多个音素放在同一行会非常有用;这样每个“真实的“音素将关联到一个整数的音素ID集合。在这种情况下我们将particular underlying(注:这个不知道怎么翻译)音素的所有变种版本共享一个根节点。下面是来自egs/wsj/s5脚本中Wall Street Journal的roots文件的一个例子(这个例子中音素是用文本表示的,而不是整数形式;但在被Kaldi读取之前会被转换成整数形式(注:就是会把音素映射成整数的ID)): 12345678910not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_Sshared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_Sshared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_Sshared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_Sshared split AO_B AO_E AO_I AO_S AO0_B AO0_E AO0_I AO0_S AO1_B AO1_E AO1_I AO1_S AO2_B AO2_E AO2_I AO2_Sshared split AW_B AW_E AW_I AW_S AW0_B AW0_E AW0_I AW0_S AW1_B AW1_E AW1_I AW1_S AW2_B AW2_E AW2_I AW2_Sshared split AY_B AY_E AY_I AY_S AY0_B AY0_E AY0_I AY0_S AY1_B AY1_E AY1_I AY1_S AY2_B AY2_E AY2_I AY2_Sshared split B_B B_E B_I B_Sshared split CH_B CH_E CH_I CH_Sshared split D_B D_E D_I D_S 
当创建这个roots文件时,你应该确保在每一行至少有一个音素是可见的(注:有对应的训练样本)。比如上面的情况,如果音素AY至少在声调和词位置的某些连接中可见,那就没问题。 在这个例子中,对于slience等音素我们有很多的词位置相关的变种。它们将共享它们的pdf's,因为它们都在同一行,并且是“not-split”,但是它们可能会有不同的状态转移参数。实际上,silence的大多数变种都不可能用到,因为silence不可能出现在词与词之间;这只是为了防止以后有人做一些奇怪的事而不会过时。 我们用从之前创建的模型(比如,单音素模型)得到的对齐结果来对混合高斯参数进行初始化;对齐的结果会被程序convert-ali从一棵树转换到另一棵(注:应该就是说对齐的transition不变,但状态绑定的参数可能因为决策树的不同而变化)。 PDF标号(PDF identifiers) PDF标号(pdf-id)是一个从0开始的数字,用做概率密度函数(p.d.f.)的序号。系统中每一个p.d.f.都有自己的pdf-id,并且是连续的(在一个LVCSR系统中一般会有几千个)。在树首先被构建时,它们就会被赋值。对于每一个pdf-id对应的是哪个音素,可能知道也可能不知道,这取决于树是怎样被构建的。 上下文相关对象(Context dependency objects) ContextDependencyInterface对象是树的一个虚基类,指定了如何与构建解码图代码进行交互。这个接口只包含四个函数: ContextWidth()返回树需要的N(上下文窗的大小)的值。 CentralPosition()返回树需要的P(窗中心位置)的值 NumPdfs()返回树定义的pdfs的数量;pdfs的编号从0到NumPdfs()-1。 Compute()是对某个特殊的上下文计算它对应的pdf-id的函数 ContextDependencyInterface::Compute()函数的声明如下: 12345class ContextDependencyInterface { ... virtual bool Compute(const std::vector<int32> &phoneseq, int32 pdf_class, int32 *pdf_id) const;} 如果能计算得到上下文和pdf-class对应的pdf-id,函数返回true。返回false时表明出现了一些错误或者是不匹配。这个函数使用的一个例子: 12345678ContextDependencyInterface *ctx_dep = ... ;vector<int32> ctx_window = { 12, 15, 21 }; // not valid C++int32 pdf_class = 1; // probably central state of 3-state HMM.int32 pdf_id;if(!ctx_dep->Compute(ctx_window, pdf_class, &pdf_id)) KALDI_ERR << "Something went wrong!"else KALDI_LOG << "Got pdf-id, it is " << pdf_id; 目前唯一继承ContextDependencyInterface的类就是ContextDependency,ContextDependency有少量更丰富的接口;唯一主要的添加就是函数GetPdfInfo,被用于TransitionModel类算出一个特殊的pdf可能对应哪些音素(这个函数的功能可以被 ContextDependencyInterface接口遍历所有的上下文而实现)。 ContextDependency对象实际上是对EventMap对象的简单组合封装;请参见Decision tree internals。我们希望尽可能地隐藏树的真正实现,使得以后需要重构代码时变得非常简单。 决策树的一个例子(An example of a decision tree) 决策树文件的格式不是以人们的可读性为首要目标而创建的,但由于大家需要我们在这里试着解释如何去解读这个文件。请看下面的例子,这个是一个来自Wall Street Journal脚本中triphone的决策树。它以这个对象的名字ContextDependency开始(注:在代码中整个树是一个ContextDependency对象);然后是N(上下文窗的大小),这里是3;接着是P(上下文窗的中心位置),这里是1。文件剩下的部分包含单个EventMap对象。EventMap是一个可能包含指向其他EventMap指针的多态类型。更多详细信息,请参见Event maps。这个文件表示一棵决策树或多棵决策树的集合,并将一个键值对集合(比如,left-phone=5, central-phone=10, right-phone=11, pdf-class=2(注:注意这里是四个键值对,表示一个中心音素是10,上文是音素5,下文是音素11的triphone的第2个状态))映射到一个pdf-id(比如,158)。简单来说,一个决策树包含三种基本类型:一个是SplitEventMap(就像决策树中的分支判断),一个是ConstantEventMap(就像决策树的叶子节点,只包含一个表示pdf-id的数字),和一个是TableEventMap(就像是一个包含其他EventMaps的一个查找表)。SplitEventMap和TableEventMap都有一个需要它们判断的key,这个值可能是0,1或者2,分别表示左上下文音素,中心音素和右上下文音素,也可能是-1,表示pdf-class的标号(注:如果HMM的每个状态都有对应的pdf,则pdf-class可理解为HMM的第几个状态)。一般情况,pdf-class的值与HMM状态的序号是相同的,比如0,1或2。请尝试不要因此而感到困惑:key是-1,value是0,1或2,但它们与上下文窗中音素的keys 0,1或2是没有任何关系的(注:上下文窗中0,1和2表示的是窗中音素的位置)。SplitEventMap有一系列值可以触发决策树的yes分支。下面是一种quasi-BNF符号表示的决策树文件格式。 1234 EventMap := ConstantEventMap | SplitEventMap | TableEventMap | "NULL"ConstantEventMap := "CE" <numeric pdf-id> SplitEventMap := "SE" <key-to-split-on> "[" yes-value-list "]" "{" EventMap EventMap "}" TableEventMap := "TE" <key-to-split-on> <table-size> "(" EventMapList ")" 在下面的例子中,树顶层的EventMap是一个以key 1进行分裂的SplitEventMap,也就是按中心音素分裂。在方括号中是一系列连续范围的phone-ids。然而,这些并不表示一个问题,它们只是音素分裂的一种方法,因此我们可以得到每个音素真正的决策树(注:音素真正的决策树是根据音素上下文和pdf-class进行决策的,对中心音素的决策只是为了找到这个音素对应的真正的决策树)。问题在于这棵树是通过“shared roots”方式创建的,所以有很多与同一音素不同词位置和音调标识相关的phone-ids,它们都共享树的根节点。在这种情况下在树的顶层我们不能使用TableEventMap,否则我们就不得不将每棵树重复好几遍(因为EventMap是一棵纯树,而不是一个通用的图,它没有指针共享的机制)。文件后面的一些“SE”标签也是quasi-tree的一部分,它们都是首先按中心音素进行分裂(当我们顺着文件往下看时我们进入了树的更深处;注意这个花括号“{”一直是打开的,还没有关闭)。然后我们看到字符串“TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 ) ”,表示通过TableEventMap对pdf-class 
-1进行分裂(实际上就是,HMM-position),并且返回从0到4的值。这5个值表示的是静音和噪声音素SIL,NSN和SPN的5个pdf-ids。在我们的设定中,这三个非语音音素的pdfs是共享的(只有转移矩阵是不同的)。注意:对于这些音素我们用5状态而不是3状态的HMM,所以这里有5个不同的pdf-ids。接下来是“SE -1 [ 0 ] ”,这可以被认为是这棵树中第一个真正的问题。我们可以从上面的SE问题看出这个问题被应用于中心音素为4到19时候,也就是音素AA的不同版本(注:原文写的是5到19,不过我认为原文有问题,改成了4到19)。这个问题问的是pdf-class(key -1)是不是0(即是不是最左边的HMM-state)。下一个问题是“SE 2 [ 220 221 222 223 ]”,问的是音素右上下文是不是音素“M”不同形式中的一个(这是一个非常有效的问题,因为我们是在最左边的HMM-state);如果问题的答案是yes,我们继续问“SE 0 [ 104 105 106 107... 286 287 ]”,这是一个关于音素左上下文的问题(注:原文写的是右上下文,但应该是左上下文);如果答案是yes,则pdf-id就是5(“CE 5”),否则就是696(“CE 696”)。 123456789101112131415161718192021222324252627s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100ContextDependency 3 1 ToPdf SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 \\26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59\\ 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 9\\3 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 1\\20 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 14\\5 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170\\ 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 \\196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 ]{ SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34\\ 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 6\\8 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 10\\1 102 103 104 105 106 107 108 109 110 111 ]{ SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34\\ 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 ]{ SE 1 [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ]{ SE 1 [ 1 2 3 ]{ TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )SE -1 [ 0 ]{ SE 2 [ 220 221 222 223 ]{ SE 0 [ 104 105 106 107 112 113 114 115 172 173 174 175 208 209 210 211 212 213 214 215 264 265 266 \\267 280 281 282 283 284 285 286 287 ]{ CE 5 CE 696 }SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 132 \\133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 248 249 250 251 252 253 254 255 256 257 2\\58 259 260 261 262 263 268 269 270 271 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 30\\3 ] 下面是一个更简单的例子:来自Resource Management脚本的单音素决策树。顶层的EventMap是一个TableEventMap(“TE 0 49 ...”)。key 0是音素位置0,表示中心(并且只有这一个)音素,因为上下文窗大小(N)为1。TE的条目数量是49(音素的数量加1)。表中第一个EventMap是NULL,因为没有序号为0的音素。下一个EventMap是一个有三个元素的TableEventMap,关联到第一个音素的三个HMM状态(技术上来说,是pdf-class):“TE -1 3 ( CE 0 CE 1 CE 2 )”。 123456s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 )TE -1 3 ( CE 3 CE 4 CE 5 )TE -1 3 ( CE 6 CE 7 CE 8 )TE -1 3 ( CE 9 CE 10 CE 11 )TE -1 3 ( CE 12 CE 13 CE 14 ) ## 输入符号信息对象(The ilabel_info object) CLG图(请参见Decoding graph construction in Kaldi)在它的输入符号位置上有表示上下文相关音素的符号(辅助符号和可能的空符号也一样)。在图中它们总是用整型的标签来表示。在代码和文件名中,我们使用一个叫做ilable_info的对象。ilable_info对象跟ContextFst对象有很密切的联系,请参见see The ContextFst object。就跟许多其他的Kaldi类型一样,ilabel_info也是一个通用的(STL)类型,但是为了可以辨别出是ilabel_info,我们使用与之相同的变量名。就是下面定义的类型: 1std::vector<std::vector<int32> > ilabel_info; 
它是一个以FST输入标签为索引的vector,给每一个输入标签一个对应的音素上下文窗(参见上文,Phonetic context windows)。比如,假设符号1500是左上下文是12和右上下文是4的音素30,我们有: 12// not valid C++ilabel_info[1500] == { 4, 30, 12 }; 在单音素的情况下,我们就会像这样: 1ilabel_info[30] == { 28 } 处理辅助符号会有点特殊(参见Disambiguation symbols或者上面引用的Springer Handbook文献,该文献解释了这些辅助符号是什么)。如果一条ilabel_info记录对应到一个辅助符号,我们就把辅助符号的符号表序号取负值放进去(注意这跟辅助符号打印形式#0,#1,#2等等里面的数字是不一样的,它是跟这些辅助符号在符号表文件中的顺序相关的数字,这个符号表文件在我们现在的脚本中叫做phones_disambig.txt)。比如, 1ilabel_info[5] == { -42 } 意味着在HCLG中符号数5对应到整数id是42的辅助符号。为了编程方便我们对这些id取负号,因此解析ilable_info对象的程序不需要给一个辅助符号的列表就可以在单音素情况下将它们跟真实的音素进行区分。有两个额外特殊情况: 123ilabel_info[0] == { }; // epsilonilabel_info[1] == { 0 }; // disambig symbol #-1;// we use symbol 1, but don't consider this hardwired. 第一个是正常的空符号,我们给它一个空的vector作为它的ilabel_info。这个符号一般不会出现在CLG的左边(注:应该是说不会作为CLG的输入符号)。第二个是一个特殊的辅助符号,打印形式叫做“#-1”。在epsilons被用做标准(Springer Handbook)脚本中C转换器输入符号的时候,我们使用辅助符号“#-1”。它可以确保有空音素表示的词的CLG网络可以被确定化。 程序fstmakecontextsyms可以创建一个与ilabel_info对象打印形式对应的符号表;这个主要用于调试和诊断错误。","categories":[{"name":"kaldi, decision tree, 决策树","slug":"kaldi-decision-tree-决策树","permalink":"https://hjchen2.github.io/categories/kaldi-decision-tree-%E5%86%B3%E7%AD%96%E6%A0%91/"}],"tags":[{"name":"kaldi","slug":"kaldi","permalink":"https://hjchen2.github.io/tags/kaldi/"},{"name":"decision tree","slug":"decision-tree","permalink":"https://hjchen2.github.io/tags/decision-tree/"},{"name":"决策树","slug":"决策树","permalink":"https://hjchen2.github.io/tags/%E5%86%B3%E7%AD%96%E6%A0%91/"},{"name":"HMM","slug":"HMM","permalink":"https://hjchen2.github.io/tags/HMM/"},{"name":"上下文相关音素","slug":"上下文相关音素","permalink":"https://hjchen2.github.io/tags/%E4%B8%8A%E4%B8%8B%E6%96%87%E7%9B%B8%E5%85%B3%E9%9F%B3%E7%B4%A0/"}]}],"categories":[{"name":"DL Compiler","slug":"DL-Compiler","permalink":"https://hjchen2.github.io/categories/DL-Compiler/"},{"name":"XRT","slug":"XRT","permalink":"https://hjchen2.github.io/categories/XRT/"},{"name":"tvm knowledge","slug":"tvm-knowledge","permalink":"https://hjchen2.github.io/categories/tvm-knowledge/"},{"name":"graph optimization, 图优化","slug":"graph-optimization-图优化","permalink":"https://hjchen2.github.io/categories/graph-optimization-%E5%9B%BE%E4%BC%98%E5%8C%96/"},{"name":"low bitwidth","slug":"low-bitwidth","permalink":"https://hjchen2.github.io/categories/low-bitwidth/"},{"name":"model compression","slug":"model-compression","permalink":"https://hjchen2.github.io/categories/model-compression/"},{"name":"neural machine translation","slug":"neural-machine-translation","permalink":"https://hjchen2.github.io/categories/neural-machine-translation/"},{"name":"ML framework","slug":"ML-framework","permalink":"https://hjchen2.github.io/categories/ML-framework/"},{"name":"code","slug":"code","permalink":"https://hjchen2.github.io/categories/code/"},{"name":"deep learning","slug":"deep-learning","permalink":"https://hjchen2.github.io/categories/deep-learning/"},{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/categories/reinforcement-learning/"},{"name":"Daily","slug":"Daily","permalink":"https://hjchen2.github.io/categories/Daily/"},{"name":"kaldi, decision tree, 决策树","slug":"kaldi-decision-tree-决策树","permalink":"https://hjchen2.github.io/categories/kaldi-decision-tree-%E5%86%B3%E7%AD%96%E6%A0%91/"}],"tags":[{"name":"Deep Learning 
Compiler","slug":"Deep-Learning-Compiler","permalink":"https://hjchen2.github.io/tags/Deep-Learning-Compiler/"},{"name":"IREE","slug":"IREE","permalink":"https://hjchen2.github.io/tags/IREE/"},{"name":"XRT","slug":"XRT","permalink":"https://hjchen2.github.io/tags/XRT/"},{"name":"Compiler","slug":"Compiler","permalink":"https://hjchen2.github.io/tags/Compiler/"},{"name":"TensorFlow XLA","slug":"TensorFlow-XLA","permalink":"https://hjchen2.github.io/tags/TensorFlow-XLA/"},{"name":"TensorRT","slug":"TensorRT","permalink":"https://hjchen2.github.io/tags/TensorRT/"},{"name":"TVM","slug":"TVM","permalink":"https://hjchen2.github.io/tags/TVM/"},{"name":"PackedFunc","slug":"PackedFunc","permalink":"https://hjchen2.github.io/tags/PackedFunc/"},{"name":"图替换","slug":"图替换","permalink":"https://hjchen2.github.io/tags/%E5%9B%BE%E6%9B%BF%E6%8D%A2/"},{"name":"超优化","slug":"超优化","permalink":"https://hjchen2.github.io/tags/%E8%B6%85%E4%BC%98%E5%8C%96/"},{"name":"graph optimization","slug":"graph-optimization","permalink":"https://hjchen2.github.io/tags/graph-optimization/"},{"name":"super optimization","slug":"super-optimization","permalink":"https://hjchen2.github.io/tags/super-optimization/"},{"name":"substitution","slug":"substitution","permalink":"https://hjchen2.github.io/tags/substitution/"},{"name":"XLA","slug":"XLA","permalink":"https://hjchen2.github.io/tags/XLA/"},{"name":"FusionStitching","slug":"FusionStitching","permalink":"https://hjchen2.github.io/tags/FusionStitching/"},{"name":"int16","slug":"int16","permalink":"https://hjchen2.github.io/tags/int16/"},{"name":"fp16","slug":"fp16","permalink":"https://hjchen2.github.io/tags/fp16/"},{"name":"混合精度训练","slug":"混合精度训练","permalink":"https://hjchen2.github.io/tags/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6%E8%AE%AD%E7%BB%83/"},{"name":"loss scaling","slug":"loss-scaling","permalink":"https://hjchen2.github.io/tags/loss-scaling/"},{"name":"QVNNI16","slug":"QVNNI16","permalink":"https://hjchen2.github.io/tags/QVNNI16/"},{"name":"pruning","slug":"pruning","permalink":"https://hjchen2.github.io/tags/pruning/"},{"name":"seq2seq","slug":"seq2seq","permalink":"https://hjchen2.github.io/tags/seq2seq/"},{"name":"machine translation","slug":"machine-translation","permalink":"https://hjchen2.github.io/tags/machine-translation/"},{"name":"Encoder-Decoder","slug":"Encoder-Decoder","permalink":"https://hjchen2.github.io/tags/Encoder-Decoder/"},{"name":"Attention","slug":"Attention","permalink":"https://hjchen2.github.io/tags/Attention/"},{"name":"large scale ML framework","slug":"large-scale-ML-framework","permalink":"https://hjchen2.github.io/tags/large-scale-ML-framework/"},{"name":"KunPeng","slug":"KunPeng","permalink":"https://hjchen2.github.io/tags/KunPeng/"},{"name":"c++","slug":"c","permalink":"https://hjchen2.github.io/tags/c/"},{"name":"python","slug":"python","permalink":"https://hjchen2.github.io/tags/python/"},{"name":"embedding","slug":"embedding","permalink":"https://hjchen2.github.io/tags/embedding/"},{"name":"caffe","slug":"caffe","permalink":"https://hjchen2.github.io/tags/caffe/"},{"name":"deep learning","slug":"deep-learning","permalink":"https://hjchen2.github.io/tags/deep-learning/"},{"name":"momentum","slug":"momentum","permalink":"https://hjchen2.github.io/tags/momentum/"},{"name":"reinforcement learning","slug":"reinforcement-learning","permalink":"https://hjchen2.github.io/tags/reinforcement-learning/"},{"name":"machine learning","slug":"machine-learning","permalink":"https://hjchen2.github.io/tags/machine-learning/"},{"name":"machine 
learning,贝尔曼公式推导","slug":"machine-learning,贝尔曼公式推导","permalink":"https://hjchen2.github.io/tags/machine-learning%EF%BC%8C%E8%B4%9D%E5%B0%94%E6%9B%BC%E5%85%AC%E5%BC%8F%E6%8E%A8%E5%AF%BC/"},{"name":"web technology","slug":"web-technology","permalink":"https://hjchen2.github.io/tags/web-technology/"},{"name":"framework","slug":"framework","permalink":"https://hjchen2.github.io/tags/framework/"},{"name":"kaldi","slug":"kaldi","permalink":"https://hjchen2.github.io/tags/kaldi/"},{"name":"decision tree","slug":"decision-tree","permalink":"https://hjchen2.github.io/tags/decision-tree/"},{"name":"决策树","slug":"决策树","permalink":"https://hjchen2.github.io/tags/%E5%86%B3%E7%AD%96%E6%A0%91/"},{"name":"HMM","slug":"HMM","permalink":"https://hjchen2.github.io/tags/HMM/"},{"name":"上下文相关音素","slug":"上下文相关音素","permalink":"https://hjchen2.github.io/tags/%E4%B8%8A%E4%B8%8B%E6%96%87%E7%9B%B8%E5%85%B3%E9%9F%B3%E7%B4%A0/"}]}