Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Frontend] Support auto schedule composition for PyTorch models #303

Merged
chhzh123 merged 7 commits into cornell-zhang:main from pytorch_compose
Feb 22, 2025

Conversation

chhzh123
Copy link
Member

Description

This PR automatically dispatches PyTorch operations to Allo kernels and applies pre-defined schedules.

Examples

import torch
import torch.nn.functional as F
import torch.nn as nn
import allo


class MLP(nn.Module):
    """Two-layer perceptron used as the example model: 16 -> 32 -> 10 + ReLU.

    The ReLU is applied only once, after the second linear layer; there is
    no activation between the two linear layers.
    """

    def __init__(self):
        super().__init__()
        # With the (8, 16) example batch this multiplies an (8, 16) input
        # by a (32, 16) weight matrix.
        self.linear1 = nn.Linear(16, 32)
        self.linear2 = nn.Linear(32, 10)

    def forward(self, data):
        """Run ``data`` through both linear layers, then apply ReLU."""
        hidden = self.linear1(data)
        logits = self.linear2(hidden)
        return F.relu(logits)


# Reference PyTorch model in eval mode, plus one random (8, 16) sample batch.
model = MLP()
model.eval()
example_inputs = [torch.rand(8, 16)]

# Hand the model to Allo's PyTorch frontend. Per the PR description, this
# dispatches the PyTorch operations to Allo kernels and applies the
# pre-defined schedules.
llvm_mod = allo.frontend.from_pytorch(
    model,
    example_inputs=example_inputs,
    verbose=True,
)

# Run the same inputs through both implementations and compare the results.
np_inputs = [tensor.detach().numpy() for tensor in example_inputs]
golden = model(*example_inputs)
res = llvm_mod(*np_inputs)
torch.testing.assert_close(res, golden.detach().numpy())
print("Passed!")
graph():
    %data : [num_users=1] = placeholder[target=data]
    %linear1 : [num_users=1] = call_module[target=linear1](args = (%data,), kwargs = {})
    %linear2 : [num_users=1] = call_module[target=linear2](args = (%linear1,), kwargs = {})
    %relu : [num_users=1] = call_function[target=torch.nn.functional.relu](args = (%linear2,), kwargs = {inplace: False})
    return relu

def forward(data: float32[8, 16]) -> (float32[8, 10]):
    linear1_weight: float32[32, 16] = g_linear1_weight
    linear1_bias: float32[32] = g_linear1_bias
    linear2_weight: float32[10, 32] = g_linear2_weight
    linear2_bias: float32[10] = g_linear2_bias
    linear1 = nn.linear[float32, 8, 32, 16, "1"](data, linear1_weight, linear1_bias)
    linear2 = nn.linear[float32, 8, 10, 32, "2"](linear1, linear2_weight, linear2_bias)
    relu = nn.relu[float32, 8, 10](linear2)
    return (relu)

module {
  memref.global "private" @linear1_weight ...
  func.func @linear_1(%arg0: memref<8x16xf32>, %arg1: memref<32x16xf32>, %arg2: memref<32xf32>) -> memref<8x32xf32> attributes {itypes = "___", otypes = "_"} {
    %alloc = memref.alloc() {name = "Z"} : memref<8x32xf32>
    %alloc_0 = memref.alloc() {name = "buf"} : memref<32xf32>
    affine.for %arg3 = 0 to 8 {
      affine.for %arg4 = 0 to 32 {
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.sitofp %c0_i32 : i32 to f32
        affine.store %0, %alloc_0[%arg4] {to = "buf"} : memref<32xf32>
      } {loop_name = "j_init", op_name = "S_j_init_0", pipeline_ii = 1 : ui32}
      affine.for %arg4 = 0 to 16 {
        %0 = affine.load %arg0[%arg3, %arg4] {from = "X"} : memref<8x16xf32>
        %alloc_1 = memref.alloc() {name = "x"} : memref<f32>
        affine.store %0, %alloc_1[] {to = "x"} : memref<f32>
        affine.for %arg5 = 0 to 32 {
          %1 = affine.load %arg1[%arg5, %arg4] {from = "W"} : memref<32x16xf32>
          %2 = affine.load %alloc_1[] {from = "x"} : memref<f32>
          %3 = arith.mulf %2, %1 : f32
          %4 = affine.load %alloc_0[%arg5] {from = "buf"} : memref<32xf32>
          %5 = arith.addf %4, %3 : f32
          affine.store %5, %alloc_0[%arg5] {to = "buf"} : memref<32xf32>
        } {loop_name = "j", op_name = "S_j_1", pipeline_ii = 1 : ui32}
      } {loop_name = "k", op_name = "S_k_1"}
      affine.for %arg4 = 0 to 32 {
        %0 = affine.load %alloc_0[%arg4] {from = "buf"} : memref<32xf32>
        %1 = affine.load %arg2[%arg4] {from = "b"} : memref<32xf32>
        %2 = arith.addf %0, %1 : f32
        affine.store %2, %alloc[%arg3, %arg4] {to = "Z"} : memref<8x32xf32>
      } {loop_name = "j_back", op_name = "S_j_back_3", pipeline_ii = 1 : ui32}
    } {loop_name = "i", op_name = "S_i_0"}
    return %alloc : memref<8x32xf32>
  }
  func.func @linear_2(%arg0: memref<8x32xf32>, %arg1: memref<10x32xf32>, %arg2: memref<10xf32>) -> memref<8x10xf32> attributes {itypes = "___", otypes = "_"} {
    %alloc = memref.alloc() {name = "Z"} : memref<8x10xf32>
    %alloc_0 = memref.alloc() {name = "buf"} : memref<10xf32>
    affine.for %arg3 = 0 to 8 {
      affine.for %arg4 = 0 to 10 {
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.sitofp %c0_i32 : i32 to f32
        affine.store %0, %alloc_0[%arg4] {to = "buf"} : memref<10xf32>
      } {loop_name = "j_init", op_name = "S_j_init_0", pipeline_ii = 1 : ui32}
      affine.for %arg4 = 0 to 32 {
        %0 = affine.load %arg0[%arg3, %arg4] {from = "X"} : memref<8x32xf32>
        %alloc_1 = memref.alloc() {name = "x"} : memref<f32>
        affine.store %0, %alloc_1[] {to = "x"} : memref<f32>
        affine.for %arg5 = 0 to 10 {
          %1 = affine.load %arg1[%arg5, %arg4] {from = "W"} : memref<10x32xf32>
          %2 = affine.load %alloc_1[] {from = "x"} : memref<f32>
          %3 = arith.mulf %2, %1 : f32
          %4 = affine.load %alloc_0[%arg5] {from = "buf"} : memref<10xf32>
          %5 = arith.addf %4, %3 : f32
          affine.store %5, %alloc_0[%arg5] {to = "buf"} : memref<10xf32>
        } {loop_name = "j", op_name = "S_j_1", pipeline_ii = 1 : ui32}
      } {loop_name = "k", op_name = "S_k_1"}
      affine.for %arg4 = 0 to 10 {
        %0 = affine.load %alloc_0[%arg4] {from = "buf"} : memref<10xf32>
        %1 = affine.load %arg2[%arg4] {from = "b"} : memref<10xf32>
        %2 = arith.addf %0, %1 : f32
        affine.store %2, %alloc[%arg3, %arg4] {to = "Z"} : memref<8x10xf32>
      } {loop_name = "j_back", op_name = "S_j_back_3", pipeline_ii = 1 : ui32}
    } {loop_name = "i", op_name = "S_i_0"}
    return %alloc : memref<8x10xf32>
  }
  func.func @relu(%arg0: memref<8x10xf32>) -> memref<8x10xf32> attributes {itypes = "_", otypes = "_"} {
    %alloc = memref.alloc() {name = "Z"} : memref<8x10xf32>
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 10 {
        %0 = affine.load %arg0[%arg1, %arg2] {from = "X"} : memref<8x10xf32>
        %cst = arith.constant 0.000000e+00 : f32
        %1 = arith.maximumf %cst, %0 : f32
        affine.store %1, %alloc[%arg1, %arg2] {to = "Z"} : memref<8x10xf32>
      } {loop_name = "j", pipeline_ii = 1 : ui32}
    } {loop_name = "i", op_name = "S_i_j_0"}
    return %alloc : memref<8x10xf32>
  }
  func.func @forward(%arg0: memref<8x16xf32>) -> memref<8x10xf32> attributes {itypes = "_", otypes = "_"} {
    %0 = memref.get_global @linear1_weight : memref<32x16xf32>
    %1 = memref.get_global @linear1_bias : memref<32xf32>
    %2 = memref.get_global @linear2_weight : memref<10x32xf32>
    %3 = memref.get_global @linear2_bias : memref<10xf32>
    %4 = call @linear_1(%arg0, %0, %1) {name = "linear1"} : (memref<8x16xf32>, memref<32x16xf32>, memref<32xf32>) -> memref<8x32xf32>
    %5 = call @linear_2(%4, %2, %3) {name = "linear2"} : (memref<8x32xf32>, memref<10x32xf32>, memref<10xf32>) -> memref<8x10xf32>
    %6 = call @relu(%5) {name = "relu"} : (memref<8x10xf32>) -> memref<8x10xf32>
    return %6 : memref<8x10xf32>
  }
}

Checklist

  • PR's title starts with a category (e.g. [Bugfix], [IR], [Builder], etc.)
  • Changes are complete (i.e. I finished coding on this PR)
  • All changes have test coverage (It would be better to provide ~2 different test cases to test the robustness of your code)
  • Code is well-documented

@chhzh123 chhzh123 merged commit 4432a66 into cornell-zhang:main Feb 22, 2025
1 check passed
@chhzh123 chhzh123 deleted the pytorch_compose branch February 22, 2025 03:43
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant