diff --git a/.gitignore b/.gitignore index cb385db..e9d8a05 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,28 @@ *.ptx *.cubin *.fatbin + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/README.md b/README.md index 520ac2c..d8e8d60 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,26 @@ cd TileFusion && git submodule update --init --recursive TileFusion requires a C++20 host compiler, CUDA 12.0 or later, and GCC version 10.0 or higher to support C++20 features. +### Build from Source + +#### Using Makefile +To build the project using the provided `Makefile`, simply run: +```bash +make +``` + +#### Building the Python Wrapper + +1. Build the wheel: + ```bash + python3 setup.py build bdist_wheel + ``` + +2. Clean the build: + ```bash + python3 setup.py clean + ``` + ### Unit Test - **Run a single unit test**: `make unit_test UNIT_TEST=test_scatter_nd.py` diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 7bb9799..d8c3e1f 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -7,7 +7,7 @@ set(CMAKE_BUILD_TYPE Release) set(CMAKE_CXX_STANDARD 20 - CACHE STRING "The C++ standard whoese features are requested." FORCE) + CACHE STRING "The C++ standard whose features are requested." 
FORCE) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD @@ -48,6 +48,12 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -std=c++20) set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -std=c++20 -O0) set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -std=c++20 -O3) +if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") + add_definitions("-DENABLE_BF16") + message(STATUS "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} " + "is greater or equal than 11.0, enable -DENABLE_BF16 flag.") +endif() + message(STATUS "tilefusion: CUDA detected: " ${CUDA_VERSION}) message(STATUS "tilefusion: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "tilefusion: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) diff --git a/pyproject.toml b/pyproject.toml index 271a4c0..9945f48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,26 @@ classifiers = [ "Operating System :: OS Independent", "Topic :: Software Development :: Libraries", ] +# NOTE: setuptools's `install_requires` can be overwritten in + # `pyproject.toml`'s `dependencies` field. + # Make sure to keep this field in sync with what is in `requirements.txt`. 
+dependencies = [ + "torch", +] [project.urls] Homepage = "https://github.com/microsoft/TileFusion" Issues = "https://github.com/microsoft/TileFusion/issues" +[build-system] +requires = [ + "cmake", + "packaging", + "setuptools>=49.4.0", + "wheel", +] +build-backend = "setuptools.build_meta" + [tool.ruff] line-length = 80 exclude = [ diff --git a/pytilefusion/__init__.py b/pytilefusion/__init__.py index 45c649b..90d3ce4 100644 --- a/pytilefusion/__init__.py +++ b/pytilefusion/__init__.py @@ -5,8 +5,6 @@ import torch -torch.ops.load_library("build/src/libtilefusion.so") - def scatter_nd(scatter_data, scatter_indices, scatter_updates): torch.ops.tilefusion.scatter_nd( diff --git a/pytilefusion/__version__.py b/pytilefusion/__version__.py new file mode 100644 index 0000000..c57bfd5 --- /dev/null +++ b/pytilefusion/__version__.py @@ -0,0 +1 @@ +__version__ = '0.0.0' diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3fdbc00 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +cmake +packaging +setuptools>=49.4.0 +torch +wheel diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4820566 --- /dev/null +++ b/setup.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import os +import subprocess +from pathlib import Path + +from setuptools import Command, Extension, find_packages, setup +from setuptools.command.build_ext import build_ext + +cur_path = Path(__file__).parent + + +def get_requirements(): + """Get Python package dependencies from requirements.txt.""" + with open(cur_path / "requirements.txt") as f: + requirements = f.read().strip().split("\n") + requirements = [req for req in requirements if "https" not in req] + return requirements + + +class CMakeExtension(Extension): + """ specify the root folder of the CMake projects""" + + def __init__(self, name, cmake_lists_dir=".", **kwargs): + Extension.__init__(self, name, sources=[], **kwargs) + self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) + + +class CMakeBuildExt(build_ext): + """launches the CMake build.""" + + def copy_extensions_to_source(self) -> None: + build_py = self.get_finalized_command("build_py") + for ext in self.extensions: + source_path = os.path.join(self.build_lib, "lib" + ext.name + ".so") + inplace_file, _ = self._get_inplace_equivalent(build_py, ext) + + target_path = os.path.join( + build_py.build_lib, "pytilefusion", inplace_file + ) + + # Always copy, even if source is older than destination, to ensure + # that the right extensions for the current Python/platform are + # used. 
+ if os.path.exists(source_path) or not ext.optional: + self.copy_file(source_path, target_path, level=self.verbose) + + def build_extension(self, ext: CMakeExtension) -> None: + # Ensure that CMake is present and working + try: + subprocess.check_output(["cmake", "--version"]) + except OSError: + raise RuntimeError("Cannot find CMake executable") from None + + debug = int( + os.environ.get("DEBUG", 0) + ) if self.debug is None else self.debug + cfg = "Debug" if debug else "Release" + + parallel_level = os.environ.get("CMAKE_BUILD_PARALLEL_LEVEL", None) + if parallel_level is not None: + self.parallel = int(parallel_level) + else: + self.parallel = os.cpu_count() + + for ext in self.extensions: + extdir = os.path.abspath( + os.path.dirname(self.get_ext_fullpath(ext.name)) + ) + + cmake_args = [ + "-DCMAKE_BUILD_TYPE=%s" % cfg, + "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format( + cfg.upper(), extdir + ), "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format( + cfg.upper(), self.build_temp + ) + ] + + # Adding CMake arguments set as environment variable + if "CMAKE_ARGS" in os.environ: + cmake_args += [ + item for item in os.environ["CMAKE_ARGS"].split(" ") if item + ] + + if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) + + build_args = [] + build_args += ["--config", cfg] + # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level + # across all generators. 
+ if ( + "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ and + hasattr(self, "parallel") and self.parallel + ): + build_args += [f"-j{self.parallel}"] + + build_temp = Path(self.build_temp) / ext.name + if not build_temp.exists(): + build_temp.mkdir(parents=True) + + # Config + subprocess.check_call(["cmake", ext.cmake_lists_dir] + cmake_args, + cwd=self.build_temp) + + # Build + subprocess.check_call(["cmake", "--build", "."] + build_args, + cwd=self.build_temp) + + print() + self.copy_extensions_to_source() + + +class Clean(Command): + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + import glob + import re + import shutil + + with open(".gitignore") as f: + ignores = f.read() + pat = re.compile(r"^#( BEGIN NOT-CLEAN-FILES )?") + for wildcard in filter(None, ignores.split("\n")): + match = pat.match(wildcard) + if match: + if match.group(1): + # Marker is found and stop reading .gitignore. + break + # Ignore lines which begin with '#'. 
+ else: + # Don't remove absolute paths from the system + wildcard = wildcard.lstrip("./") + + for filename in glob.glob(wildcard): + print(f"cleaning '{filename}'") + try: + os.remove(filename) + except OSError: + shutil.rmtree(filename, ignore_errors=True) + + +description = ("PyTileFusion: A Python wrapper for tilefusion C++ library.") + +with open(os.path.join("pytilefusion", "__version__.py")) as f: + exec(f.read()) + +setup( + name="tilefusion", + version=__version__, # noqa F821 + description=description, + author="Ying Cao, Chengxiang Qi", + python_requires=">=3.10", + packages=find_packages(exclude=[""]), + install_requires=get_requirements(), + ext_modules=[CMakeExtension("tilefusion")], + cmdclass={ + "build_ext": CMakeBuildExt, + "clean": Clean, + }, +) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 27da770..6d51675 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,15 @@ set_target_properties( CUDA_SEPARABLE_COMPILATION ON) target_compile_options( - ${TARGET} PUBLIC $<$: -Werror,-Wall -rdc=true - -std=c++20 -fconcepts -fpermissive>) + ${TARGET} + PUBLIC $<$: + -Werror,-Wall + -rdc=true + -std=c++20 + -fconcepts + -fpermissive + --use_fast_math + --generate-line-info + >) target_compile_features(${TARGET} PUBLIC cxx_std_20 cuda_std_20) target_link_libraries(${TARGET} "${TORCH_LIBRARIES}") diff --git a/tests/python/context.py b/tests/python/context.py deleted file mode 100644 index 98855c5..0000000 --- a/tests/python/context.py +++ /dev/null @@ -1,11 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -import os -import sys - -sys.path.insert( - 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) -) diff --git a/tests/python/test_flash_attn.py b/tests/python/test_flash_attn.py index d5f751c..5b525e4 100644 --- a/tests/python/test_flash_attn.py +++ b/tests/python/test_flash_attn.py @@ -5,7 +5,6 @@ import unittest -import context # noqa: F401 import torch from pytilefusion import TiledFlashAttention diff --git a/tests/python/test_scatter_nd.py b/tests/python/test_scatter_nd.py index 1b4402d..40caaad 100644 --- a/tests/python/test_scatter_nd.py +++ b/tests/python/test_scatter_nd.py @@ -6,7 +6,6 @@ import random import unittest -import context # noqa: F401 import torch from pytilefusion import scatter_nd