[Frontend] Support auto schedule composition for PyTorch models #303

chhzh123 · 2025-02-22T02:45:23Z

Description

This PR automatically dispatches PyTorch operations to Allo kernels and applies pre-defined schedules.

Examples

import torch
import torch.nn.functional as F
import torch.nn as nn
import allo


class MLP(nn.Module):
    """Small example network: Linear(16 -> 32), Linear(32 -> 10), then ReLU.

    There is no activation between the two linear layers; ReLU is applied
    only to the final 10-wide output.
    """

    def __init__(self):
        super().__init__()
        # nn.Linear keeps its weight as (out_features, in_features), so e.g. an
        # (8, 16) batch is multiplied against a (32, 16) weight in linear1.
        self.linear1 = nn.Linear(in_features=16, out_features=32)
        self.linear2 = nn.Linear(in_features=32, out_features=10)

    def forward(self, data):
        """Run ``data`` through both linear layers and clamp negatives to zero."""
        hidden = self.linear1(data)
        projected = self.linear2(hidden)
        return F.relu(projected)


# Instantiate the model in inference mode together with one random (8, 16) batch.
model = MLP()
model.eval()
example_inputs = [torch.rand(8, 16)]

# Hand the model to the Allo PyTorch frontend. The returned object is called
# below like a function on NumPy arrays.
# NOTE(review): verbose=True presumably prints the traced graph and generated
# code shown after this example -- confirm against the frontend implementation.
llvm_mod = allo.frontend.from_pytorch(
    model,
    example_inputs=example_inputs,
    verbose=True,
)

# Reference result computed by PyTorch itself.
golden = model(*example_inputs)

# Feed the same inputs, converted to NumPy arrays, to the Allo-built module and
# require its output to agree with the PyTorch reference.
np_inputs = [tensor.detach().numpy() for tensor in example_inputs]
res = llvm_mod(*np_inputs)
torch.testing.assert_close(actual=res, expected=golden.detach().numpy())
print("Passed!")

graph():
    %data : [num_users=1] = placeholder[target=data]
    %linear1 : [num_users=1] = call_module[target=linear1](args = (%data,), kwargs = {})
    %linear2 : [num_users=1] = call_module[target=linear2](args = (%linear1,), kwargs = {})
    %relu : [num_users=1] = call_function[target=torch.nn.functional.relu](args = (%linear2,), kwargs = {inplace: False})
    return relu

def forward(data: float32[8, 16]) -> (float32[8, 10]):
    linear1_weight: float32[32, 16] = g_linear1_weight
    linear1_bias: float32[32] = g_linear1_bias
    linear2_weight: float32[10, 32] = g_linear2_weight
    linear2_bias: float32[10] = g_linear2_bias
    linear1 = nn.linear[float32, 8, 32, 16, "1"](data, linear1_weight, linear1_bias)
    linear2 = nn.linear[float32, 8, 10, 32, "2"](linear1, linear2_weight, linear2_bias)
    relu = nn.relu[float32, 8, 10](linear2)
    return (relu)

module {
  memref.global "private" @linear1_weight ...
  func.func @linear_1(%arg0: memref<8x16xf32>, %arg1: memref<32x16xf32>, %arg2: memref<32xf32>) -> memref<8x32xf32> attributes {itypes = "___", otypes = "_"} {
    %alloc = memref.alloc() {name = "Z"} : memref<8x32xf32>
    %alloc_0 = memref.alloc() {name = "buf"} : memref<32xf32>
    affine.for %arg3 = 0 to 8 {
      affine.for %arg4 = 0 to 32 {
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.sitofp %c0_i32 : i32 to f32
        affine.store %0, %alloc_0[%arg4] {to = "buf"} : memref<32xf32>
      } {loop_name = "j_init", op_name = "S_j_init_0", pipeline_ii = 1 : ui32}
      affine.for %arg4 = 0 to 16 {
        %0 = affine.load %arg0[%arg3, %arg4] {from = "X"} : memref<8x16xf32>
        %alloc_1 = memref.alloc() {name = "x"} : memref<f32>
        affine.store %0, %alloc_1[] {to = "x"} : memref<f32>
        affine.for %arg5 = 0 to 32 {
          %1 = affine.load %arg1[%arg5, %arg4] {from = "W"} : memref<32x16xf32>
          %2 = affine.load %alloc_1[] {from = "x"} : memref<f32>
          %3 = arith.mulf %2, %1 : f32
          %4 = affine.load %alloc_0[%arg5] {from = "buf"} : memref<32xf32>
          %5 = arith.addf %4, %3 : f32
          affine.store %5, %alloc_0[%arg5] {to = "buf"} : memref<32xf32>
        } {loop_name = "j", op_name = "S_j_1", pipeline_ii = 1 : ui32}
      } {loop_name = "k", op_name = "S_k_1"}
      affine.for %arg4 = 0 to 32 {
        %0 = affine.load %alloc_0[%arg4] {from = "buf"} : memref<32xf32>
        %1 = affine.load %arg2[%arg4] {from = "b"} : memref<32xf32>
        %2 = arith.addf %0, %1 : f32
        affine.store %2, %alloc[%arg3, %arg4] {to = "Z"} : memref<8x32xf32>
      } {loop_name = "j_back", op_name = "S_j_back_3", pipeline_ii = 1 : ui32}
    } {loop_name = "i", op_name = "S_i_0"}
    return %alloc : memref<8x32xf32>
  }
  func.func @linear_2(%arg0: memref<8x32xf32>, %arg1: memref<10x32xf32>, %arg2: memref<10xf32>) -> memref<8x10xf32> attributes {itypes = "___", otypes = "_"} {
    %alloc = memref.alloc() {name = "Z"} : memref<8x10xf32>
    %alloc_0 = memref.alloc() {name = "buf"} : memref<10xf32>
    affine.for %arg3 = 0 to 8 {
      affine.for %arg4 = 0 to 10 {
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.sitofp %c0_i32 : i32 to f32
        affine.store %0, %alloc_0[%arg4] {to = "buf"} : memref<10xf32>
      } {loop_name = "j_init", op_name = "S_j_init_0", pipeline_ii = 1 : ui32}
      affine.for %arg4 = 0 to 32 {
        %0 = affine.load %arg0[%arg3, %arg4] {from = "X"} : memref<8x32xf32>
        %alloc_1 = memref.alloc() {name = "x"} : memref<f32>
        affine.store %0, %alloc_1[] {to = "x"} : memref<f32>
        affine.for %arg5 = 0 to 10 {
          %1 = affine.load %arg1[%arg5, %arg4] {from = "W"} : memref<10x32xf32>
          %2 = affine.load %alloc_1[] {from = "x"} : memref<f32>
          %3 = arith.mulf %2, %1 : f32
          %4 = affine.load %alloc_0[%arg5] {from = "buf"} : memref<10xf32>
          %5 = arith.addf %4, %3 : f32
          affine.store %5, %alloc_0[%arg5] {to = "buf"} : memref<10xf32>
        } {loop_name = "j", op_name = "S_j_1", pipeline_ii = 1 : ui32}
      } {loop_name = "k", op_name = "S_k_1"}
      affine.for %arg4 = 0 to 10 {
        %0 = affine.load %alloc_0[%arg4] {from = "buf"} : memref<10xf32>
        %1 = affine.load %arg2[%arg4] {from = "b"} : memref<10xf32>
        %2 = arith.addf %0, %1 : f32
        affine.store %2, %alloc[%arg3, %arg4] {to = "Z"} : memref<8x10xf32>
      } {loop_name = "j_back", op_name = "S_j_back_3", pipeline_ii = 1 : ui32}
    } {loop_name = "i", op_name = "S_i_0"}
    return %alloc : memref<8x10xf32>
  }
  func.func @relu(%arg0: memref<8x10xf32>) -> memref<8x10xf32> attributes {itypes = "_", otypes = "_"} {
    %alloc = memref.alloc() {name = "Z"} : memref<8x10xf32>
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 10 {
        %0 = affine.load %arg0[%arg1, %arg2] {from = "X"} : memref<8x10xf32>
        %cst = arith.constant 0.000000e+00 : f32
        %1 = arith.maximumf %cst, %0 : f32
        affine.store %1, %alloc[%arg1, %arg2] {to = "Z"} : memref<8x10xf32>
      } {loop_name = "j", pipeline_ii = 1 : ui32}
    } {loop_name = "i", op_name = "S_i_j_0"}
    return %alloc : memref<8x10xf32>
  }
  func.func @forward(%arg0: memref<8x16xf32>) -> memref<8x10xf32> attributes {itypes = "_", otypes = "_"} {
    %0 = memref.get_global @linear1_weight : memref<32x16xf32>
    %1 = memref.get_global @linear1_bias : memref<32xf32>
    %2 = memref.get_global @linear2_weight : memref<10x32xf32>
    %3 = memref.get_global @linear2_bias : memref<10xf32>
    %4 = call @linear_1(%arg0, %0, %1) {name = "linear1"} : (memref<8x16xf32>, memref<32x16xf32>, memref<32xf32>) -> memref<8x32xf32>
    %5 = call @linear_2(%4, %2, %3) {name = "linear2"} : (memref<8x32xf32>, memref<10x32xf32>, memref<10xf32>) -> memref<8x10xf32>
    %6 = call @relu(%5) {name = "relu"} : (memref<8x10xf32>) -> memref<8x10xf32>
    return %6 : memref<8x10xf32>
  }
}

Checklist

PR's title starts with a category (e.g., [Bugfix], [IR], [Builder], etc.)
Changes are complete (i.e. I finished coding on this PR)
All changes have test coverage (ideally, provide at least two different test cases to check the robustness of your code)
Code is well-documented

chhzh123 added 7 commits February 21, 2025 20:27

Fix file_name

6724bcc

Add nn.linear

fcbc113

Fix function call

d1ecc01

Add linear composition

e120406

Add relu

5115cbb

Fix pylint

fd12fa1

Fix pytest

503dc02

chhzh123 merged commit 4432a66 into cornell-zhang:main Feb 22, 2025
1 check passed

chhzh123 deleted the pytorch_compose branch February 22, 2025 03:43

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Frontend] Support auto schedule composition for PyTorch models #303

[Frontend] Support auto schedule composition for PyTorch models #303

chhzh123 commented Feb 22, 2025

[Frontend] Support auto schedule composition for PyTorch models #303

[Frontend] Support auto schedule composition for PyTorch models #303

Conversation

chhzh123 commented Feb 22, 2025

Description

Examples

Checklist