Skip to content

Commit

Permalink
Liqun/havenka/rel 1.8.1 round3 (#8246)
Browse files Browse the repository at this point in the history
* Revert the cuda algo finding change as this causes a significant memory bloat. (#8181)

* Revert the cuda algo finding change as this causes a significant memory bloat.

* Address PR comment

* Make pipelines support torch 1.8.1 and torch 1.9.0 (#8084)

* Add post-install command to build PyTorch CPP extensions from within onnxruntime package (#8027)

ORTModule requires two PyTorch CPP extensions that are currently JIT compiled. The runtime compilation can cause issues in some environments without all build requirements or in environments with multiple instances of ORTModule running in parallel.

This PR creates a custom command to compile such extensions that must be manually executed before ORTModule is executed for the first time. When users try to use ORTModule before the extensions are compiled, an error with instructions is raised.

PyTorch CPP Extensions for ORTModule can be compiled by running:
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install

A full build environment is needed for this.

* Patch orttraining-ortmodule pipeline with latest fix on master

* add cuda version to build config

* lib path

* .

* .

* .

* .

* .

* .

* .

* .

* .

* .

* .

* Remove auto doc gen

Co-authored-by: Pranav Sharma <prs@microsoft.com>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Co-authored-by: Baiju Meswani <bmeswani@microsoft.com>
  • Loading branch information
4 people authored Jul 1, 2021
1 parent 6057515 commit 96bb4b1
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ stages:
build_py_parameters: --enable_training --update --build
torch_version: '1.8.1'
cuda_version: '11.1'
gcc_version: 9
gcc_version: 8
cmake_cuda_architectures: 37;50;52;60;61;70;75;80
docker_file: Dockerfile.manylinux2014_training_cuda11_1
agent_pool: Onnxruntime-Linux-GPU
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ stages:
build_py_parameters: --enable_training --update --build
torch_version: '1.9.0'
cuda_version: '11.1'
gcc_version: 9
gcc_version: 8
cmake_cuda_architectures: 37;50;52;60;61;70;75;80
docker_file: Dockerfile.manylinux2014_training_cuda11_1
agent_pool: Onnxruntime-Linux-GPU
Original file line number Diff line number Diff line change
Expand Up @@ -117,32 +117,6 @@ stages:
Contents: 'Release/dist/*.whl'
TargetFolder: '$(Build.ArtifactStagingDirectory)'

- task: CmdLine@2
displayName: 'Build Python Documentation'
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
inputs:
script: |
mkdir -p $HOME/.onnx
docker run --rm \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
onnxruntimecpubuild \
bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/*.whl && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release " ;
workingDirectory: $(Build.SourcesDirectory)

- task: CopyFiles@2
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
inputs:
SourceFolder: '$(Build.BinariesDirectory)/docs/inference/html'
Contents: '**'
TargetFolder: '$(Build.ArtifactStagingDirectory)/inference_html_doc'

- task: PublishBuildArtifacts@1
displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
inputs:
Expand Down Expand Up @@ -329,7 +303,7 @@ stages:
render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}')
echo "##vso[task.setvariable variable=render]$render_gid"
displayName: 'Find video and render gid to be mapped into container'
- script: |-
echo "video=$video"
echo "render=$render"
Expand All @@ -354,7 +328,7 @@ stages:
onnxruntimetrainingrocmbuild \
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run onnxruntime unit tests (in container)'
- script: |-
docker run --rm \
--device=/dev/kfd \
Expand All @@ -381,7 +355,7 @@ stages:
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test (in container)'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
docker run --rm \
--device=/dev/kfd \
Expand Down Expand Up @@ -409,7 +383,7 @@ stages:
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L performance test (in container)'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
docker run --rm \
--device=/dev/kfd \
Expand Down Expand Up @@ -437,38 +411,14 @@ stages:
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L convergence test (in container)'
condition: succeededOrFailed() # ensure all tests are run
- task: CopyFiles@2
displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
inputs:
SourceFolder: '$(Build.BinariesDirectory)'
Contents: 'Release/dist/*.whl'
TargetFolder: '$(Build.ArtifactStagingDirectory)'

- task: CmdLine@2
displayName: 'Build Python Documentation'
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
inputs:
script: |
mkdir -p $HOME/.onnx
docker run --rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--entrypoint /bin/bash \
onnxruntimetrainingrocmbuild \
/onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release
workingDirectory: $(Build.SourcesDirectory)

- task: CopyFiles@2
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
inputs:
SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
Contents: '**'
TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'

- task: PublishBuildArtifacts@1
displayName: 'Upload Rocm wheel as build artifact'
inputs:
Expand Down Expand Up @@ -737,7 +687,7 @@ stages:
displayName: 'Publish Artifact: ONNXRuntime python wheel'
inputs:
ArtifactName: onnxruntime_gpu

- task: DeleteFiles@1
displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo'
condition: and (succeeded(), eq(variables['PythonVersion'], '3.7'))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ parameters:
displayName: >
gcc_version.
type: number

- name: docker_file
displayName: >
docker_file.
Expand Down Expand Up @@ -87,9 +87,9 @@ stages:
--build-arg INSTALL_DEPS_EXTRA_ARGS=-tu
--build-arg BUILD_UID=$(id -u)
--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root
--build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin:
--build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root
--build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin:
--build-arg LD_LIBRARY_PATH_ARG=$(PythonManylinuxLibDir):/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64
Repository: onnxruntimetraininggpubuild

- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
Expand Down Expand Up @@ -128,7 +128,7 @@ stages:
--build_wheel \
--enable_onnx_tests \
${{ parameters.build_py_parameters }} \
--cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' \
--cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=$(PythonManylinuxLibDir) CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' \
--use_cuda --cuda_version=$(CudaVersion) --cuda_home=/usr/local/cuda-$(CudaVersion) --cudnn_home=/usr/local/cuda-$(CudaVersion) ;
workingDirectory: $(Build.SourcesDirectory)

Expand Down Expand Up @@ -159,34 +159,6 @@ stages:
Contents: 'Release/dist/*.whl'
TargetFolder: '$(Build.ArtifactStagingDirectory)'

- task: CmdLine@2
displayName: 'Build Python Documentation'
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
inputs:
script: |
mkdir -p $HOME/.onnx
docker run --rm \
--gpus all \
-e NVIDIA_VISIBLE_DEVICES=all \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
onnxruntimetraininggpubuild \
bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/*.whl && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release " ;
workingDirectory: $(Build.SourcesDirectory)

- task: CopyFiles@2
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9
inputs:
SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
Contents: '**'
TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'

- task: PublishBuildArtifacts@1
displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation'
inputs:
Expand All @@ -207,7 +179,7 @@ stages:
--account_key $(orttrainingpackagestorageaccountkey) \
--container_name '$web'
condition: succeededOrFailed()
displayName:
displayName:

- template: component-governance-component-detection-steps.yml
parameters:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,31 @@ steps:
variables = {
"PythonManylinuxDir": "/opt/python/cp35-cp35m",
"PythonManylinuxIncludeDir": "/opt/python/cp35-cp35m/include/python3.5m",
"PythonManylinuxLibDir": "/opt/python/cp35-cp35m/lib/python3.5",
}
elif version == "3.6":
variables = {
"PythonManylinuxDir": "/opt/python/cp36-cp36m",
"PythonManylinuxIncludeDir": "/opt/python/cp36-cp36m/include/python3.6m",
"PythonManylinuxLibDir": "/opt/python/cp36-cp36m/lib/python3.6",
}
elif version == "3.7":
variables = {
"PythonManylinuxDir": "/opt/python/cp37-cp37m",
"PythonManylinuxIncludeDir": "/opt/python/cp37-cp37m/include/python3.7m",
"PythonManylinuxLibDir": "/opt/python/cp37-cp37m/lib/python3.7",
}
elif version == "3.8":
variables = {
"PythonManylinuxDir": "/opt/python/cp38-cp38",
"PythonManylinuxIncludeDir": "/opt/python/cp38-cp38/include/python3.8",
"PythonManylinuxLibDir": "/opt/python/cp38-cp38/lib/python3.8",
}
elif version == "3.9":
variables = {
"PythonManylinuxDir": "/opt/python/cp39-cp39",
"PythonManylinuxIncludeDir": "/opt/python/cp39-cp39/include/python3.9",
"PythonManylinuxLibDir": "/opt/python/cp39-cp39/lib/python3.9",
}
else:
raise ValueError("Unsupported Python version: '{}'".format(version))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH_ARG}:$DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig

COPY manylinux2014_build_scripts /manylinux2014_build_scripts
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH_ARG}:$DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig

COPY manylinux2014_build_scripts /manylinux2014_build_scripts
Expand Down

0 comments on commit 96bb4b1

Please sign in to comment.