From c13e79a78365b0b0de48bbe7f932923c8e90688b Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 5 Apr 2024 11:34:57 +0000 Subject: [PATCH] 2024-04-05 nightly release (e5a8de09a67d3e9ac8c8d67e47b62dec30b4b84d) --- .github/workflows/apple.yml | 11 ++ .swift/coreml_backend_debug/dummy.swift | 0 .swift/custom_backend_debug/dummy.swift | 0 .swift/executorch_debug/dummy.swift | 0 .swift/mps_backend_debug/dummy.swift | 0 .swift/optimized_backend_debug/dummy.swift | 0 .swift/portable_backend_debug/dummy.swift | 0 .swift/quantized_backend_debug/dummy.swift | 0 .swift/xnnpack_backend_debug/dummy.swift | 0 CMakeLists.txt | 5 + Package.swift | 152 ++++++++-------- docs/source/getting-started-setup.md | 5 + examples/apple/coreml/scripts/export.py | 21 +-- .../executorchllamademo/MainActivity.java | 27 ++- .../LLaMA/LLaMA.xcodeproj/project.pbxproj | 170 ++++++++++++------ examples/models/llama2/README.md | 94 ++++++++-- examples/models/llama2/eval_llama_lib.py | 26 +-- examples/models/llama2/export_llama_lib.py | 31 +++- examples/models/llama2/runner/runner.cpp | 110 +++++++----- examples/models/llama2/runner/runner.h | 60 +++---- examples/models/llama2/runner/targets.bzl | 2 +- exir/serde/export_serialize.py | 2 +- extension/android/jni/jni_layer_llama.cpp | 19 +- .../org/pytorch/executorch/LlamaCallback.java | 15 +- .../org/pytorch/executorch/LlamaModule.java | 9 + 25 files changed, 484 insertions(+), 275 deletions(-) create mode 100644 .swift/coreml_backend_debug/dummy.swift create mode 100644 .swift/custom_backend_debug/dummy.swift create mode 100644 .swift/executorch_debug/dummy.swift create mode 100644 .swift/mps_backend_debug/dummy.swift create mode 100644 .swift/optimized_backend_debug/dummy.swift create mode 100644 .swift/portable_backend_debug/dummy.swift create mode 100644 .swift/quantized_backend_debug/dummy.swift create mode 100644 .swift/xnnpack_backend_debug/dummy.swift diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 
6a8c092575..c5ca167afb 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -100,6 +100,17 @@ jobs: zip -r "${RUNNER_TEMP}/artifacts/${FRAMEWORK}-${VERSION}.zip" "${FRAMEWORK}.xcframework" ) done + # Build Debug iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack --Debug + + # Bundle Debug iOS Frameworks + for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( + cd cmake-out && \ + mv "${FRAMEWORK}.xcframework" "${FRAMEWORK}_debug.xcframework" && \ + zip -r "${RUNNER_TEMP}/artifacts/${FRAMEWORK}_debug-${VERSION}.zip" "${FRAMEWORK}_debug.xcframework" + ) done + popd upload-frameworks-ios: diff --git a/.swift/coreml_backend_debug/dummy.swift b/.swift/coreml_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/custom_backend_debug/dummy.swift b/.swift/custom_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/executorch_debug/dummy.swift b/.swift/executorch_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/mps_backend_debug/dummy.swift b/.swift/mps_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/optimized_backend_debug/dummy.swift b/.swift/optimized_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/portable_backend_debug/dummy.swift b/.swift/portable_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/quantized_backend_debug/dummy.swift b/.swift/quantized_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.swift/xnnpack_backend_debug/dummy.swift b/.swift/xnnpack_backend_debug/dummy.swift new file mode 100644 index 0000000000..e69de29bb2 diff --git a/CMakeLists.txt b/CMakeLists.txt index caf9959d7b..46b73f6349 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,6 +224,11 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) set(PTHREADPOOL_ALLOW_DEPRECATED_API ON CACHE BOOL "") + if(APPLE) + set(PTHREADPOOL_SYNC_PRIMITIVE + "condvar" + CACHE STRING "") + endif() add_subdirectory("${PTHREADPOOL_SOURCE_DIR}") endif() diff --git a/Package.swift b/Package.swift index d62af6fb0f..ce256e68e8 100644 --- a/Package.swift +++ b/Package.swift @@ -9,95 +9,93 @@ import PackageDescription -let url = "https://ossci-ios.s3.amazonaws.com/executorch" let version = "0.1.0" -let coreml_sha256 = "78d853d87be478696e56e658aa4ff17d47ae185a9a6a36316c821fa8b2d3aacd" -let custom_sha256 = "f059f6716298403dff89a952a70e323c54911be140d05f2467bd5cc61aaefae3" -let executorch_sha256 = "ba9a0c2b061afaedbc3c5454040a598b1371170bd9d9a30b7163c20e23339841" -let mps_sha256 = "39542a8671cca1aa627102aa47785d0f6e2dfe9a40e2c22288a755057b00fbfa" -let optimized_sha256 = "1d84fa16197bb6f0dec01aaa29d2a140c0e14d8e5e92630a7b4dd6f48012506d" -let portable_sha256 = "4993904f89ecb4476677ff3c072ed1a314a608170f10d364cfd23947851ccbf3" -let quantized_sha256 = "8d35ee0e7ca77c19782eaea07a1888f576cda679f8a4a5edb03d80ebe858047e" -let xnnpack_sha256 = "380e5185c4c48ede7cc0d0f0657ffb26df83cd9f55813d78593aea8a93942caf" - -struct Framework { - let name: String - let checksum: String - var frameworks: [String] = [] - var libraries: [String] = [] - - func target() -> Target { - .binaryTarget( - name: name, - url: "\(url)/\(name)-\(version).zip", - checksum: checksum - ) - } - - func dependencies() -> Target { - .target( - name: "\(name)_dependencies", - dependencies: [.target(name: name)], - path: ".swift/\(name)", - linkerSettings: - frameworks.map { .linkedFramework($0) } + - libraries.map { .linkedLibrary($0) } - ) - } -} - -let frameworks = [ - Framework( - name: "coreml_backend", - checksum: coreml_sha256, - frameworks: [ +let url = "https://ossci-ios.s3.amazonaws.com/executorch/" +let debug = "_debug" +let deliverables = [ + "coreml_backend": [ + "sha256": 
"0e5973bbc547e3a39f988f9a7a68b47bda0a6a17b04516fff6957fd527f8cd48", + "sha256" + debug: "c63773f0098625f884fecb11b4a5f6318b97d566329fef8b013444829cd7c421", + "frameworks": [ "Accelerate", "CoreML", ], - libraries: [ + "libraries": [ "sqlite3", - ] - ), - Framework( - name: "custom_backend", - checksum: custom_sha256 - ), - Framework( - name: "executorch", - checksum: executorch_sha256 - ), - Framework( - name: "mps_backend", - checksum: mps_sha256, - frameworks: [ + ], + ], + "custom_backend": [ + "sha256": "c8405e21324262cd6590046096ddeb3ac33a598f88afc817a2f2fdee821da150", + "sha256" + debug: "a08a6aa15ddce61a76cd1bf2206d017cc4ac7dcb9ca312ad7750a36814448eaa", + ], + "executorch": [ + "sha256": "57269f9b81d56a3d96ece2012e2ece3af24174846abd98de9a3bee07f3b9583d", + "sha256" + debug: "66975caf3d9c1238d29945288f23ddb6e07e16386d4dedf429c0f2d81cfbe0cc", + ], + "mps_backend": [ + "sha256": "bb7531172252b6535429fbde429de208665f933d0f509982872eada86839e734", + "sha256" + debug: "6d41437e40cb794b4b7a0d971931773de263370463b38a014f38e99bd1c5d52b", + "frameworks": [ "Metal", "MetalPerformanceShaders", "MetalPerformanceShadersGraph", - ] - ), - Framework( - name: "optimized_backend", - checksum: optimized_sha256 - ), - Framework( - name: "portable_backend", - checksum: portable_sha256 - ), - Framework( - name: "quantized_backend", - checksum: quantized_sha256 - ), - Framework( - name: "xnnpack_backend", - checksum: xnnpack_sha256 - ) -] + ], + ], + "optimized_backend": [ + "sha256": "bdab593fb49c9000291dbf691ad578d771883745ed2851f00492e828d089d1ea", + "sha256" + debug: "8316ad259d6aafecf2e9abc91a04fc1fa3e0398597e043119b4c29c21e9f2029", + ], + "portable_backend": [ + "sha256": "38ebdad7d5cd24ca44cd950d561dcf9a9b883dff626c167bc6f5f28f041b8406", + "sha256" + debug: "9e68b3e92e5c920875845f59821ee984b87486d05c1bf8a461b011530e02dd55", + ], + "quantized_backend": [ + "sha256": "245a3acbf06c6afe9cfb6b03eddfa015390e582ffdfb76efd23b7c810f080f10", + "sha256" + debug: 
"134d759fe708a4ffbf7efbd25c6020186e1a13abc0dac0a897e2fe13aac3e76a", + ], + "xnnpack_backend": [ + "sha256": "a1c9cf8347c17f3e50e45d7f37f64ee040f0a1b0a40fa4748d90b45c4150e3b2", + "sha256" + debug: "e92a15c2982630951e5ae5e927d548049db25d89e8b639e8901c5f4650f3a7d0", + ], +].reduce(into: [String: [String: Any]]()) { + $0[$1.key] = $1.value + $0[$1.key + debug] = $1.value +} +.reduce(into: [String: [String: Any]]()) { + var newValue = $1.value + if $1.key.hasSuffix(debug) { + $1.value.forEach { key, value in + if key.hasSuffix(debug) { + newValue[String(key.dropLast(debug.count))] = value + } + } + } + $0[$1.key] = newValue.filter { key, _ in !key.hasSuffix(debug) } +} let package = Package( name: "executorch", platforms: [ .iOS(.v15), ], - products: frameworks.map { .library(name: $0.name, targets: ["\($0.name)_dependencies"]) }, - targets: frameworks.flatMap { [$0.target(), $0.dependencies()] } + products: deliverables.keys.map { key in + .library(name: key, targets: ["\(key)_dependencies"]) + }.sorted { $0.name < $1.name }, + targets: deliverables.flatMap { key, value -> [Target] in + [ + .binaryTarget( + name: key, + url: "\(url)\(key)-\(version).zip", + checksum: value["sha256"] as? String ?? "" + ), + .target( + name: "\(key)_dependencies", + dependencies: [.target(name: key)], + path: ".swift/\(key)", + linkerSettings: + (value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } + + (value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) } + ), + ] + } ) diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index 150ca5c143..ff6ad626e3 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -117,6 +117,11 @@ Follow these steps: ./install_requirements.sh ``` + To install with pybindings and dependencies for other backends. 
See options [here](https://github.com/pytorch/executorch/blob/main/install_requirements.sh#L26-L29): + ```bash + ./install_requirements.sh --pybind + ``` + You have successfully set up your environment to work with ExecuTorch. The next step is to generate a sample ExecuTorch program. diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index e71fa11e5c..e70c67e436 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -19,9 +19,11 @@ from executorch.backends.apple.coreml.partition.coreml_partitioner import ( CoreMLPartitioner, ) +from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend from executorch.sdk.etrecord import generate_etrecord +from torch.export import export REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent EXAMPLES_DIR = REPO_ROOT / "examples" @@ -32,7 +34,6 @@ # Script to export a model with coreml delegation. -_CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True, _unlift=False) _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( _check_ir_validity=False, ) @@ -84,9 +85,7 @@ def partition_module_to_coreml(module): def lower_module_to_coreml(module, compile_specs): module = module.eval() - edge = exir.capture(module, example_inputs, _CAPTURE_CONFIG).to_edge( - _EDGE_COMPILE_CONFIG - ) + edge = to_edge(export(module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG) # All of the subsequent calls on the edge_dialect_graph generated above (such as delegation or # to_executorch()) are done in place and the graph is also modified in place. 
For debugging purposes # we would like to keep a copy of the original edge dialect graph and hence we create a deepcopy of @@ -95,7 +94,7 @@ def lower_module_to_coreml(module, compile_specs): lowered_module = to_backend( CoreMLBackend.__name__, - edge.exported_program, + edge.exported_program(), compile_specs, ) @@ -104,13 +103,11 @@ def lower_module_to_coreml(module, compile_specs): def export_lowered_module_to_executorch_program(lowered_module, example_inputs): lowered_module(*example_inputs) - exec_prog = ( - exir.capture(lowered_module, example_inputs, _CAPTURE_CONFIG) - .to_edge(_EDGE_COMPILE_CONFIG) - .to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) + exec_prog = to_edge( + export(lowered_module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG + ).to_executorch( + config=exir.ExecutorchBackendConfig( + extract_constant_segment=False, extract_delegate_segments=True ) ) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index b963cc57c1..dc34b22818 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -30,19 +30,26 @@ public class MainActivity extends Activity implements Runnable, LlamaCallback { private LlamaModule mModule = null; private Message mResultMessage = null; - private int mNumTokens = 0; - private long mRunStartTime = 0; private String mModelFilePath = ""; private String mTokenizerFilePath = ""; @Override public void onResult(String result) { - System.out.println("onResult: " + result); mResultMessage.appendText(result); - mNumTokens++; run(); } + @Override + public void onStats(float tps) { + runOnUiThread( + () -> { + if 
(mResultMessage != null) { + mResultMessage.setTokensPerSecond(tps); + mMessageAdapter.notifyDataSetChanged(); + } + }); + } + private static String[] listLocalFile(String path, String suffix) { File directory = new File(path); if (directory.exists() && directory.isDirectory()) { @@ -79,14 +86,14 @@ private void setLocalModel(String modelPath, String tokenizerPath) { }); } - long runDuration = System.currentTimeMillis() - runStartTime; + long loadDuration = System.currentTimeMillis() - runStartTime; String modelInfo = "Model path: " + modelPath + "\nTokenizer path: " + tokenizerPath + "\nModel loaded time: " - + runDuration + + loadDuration + " ms"; Message modelLoadedMessage = new Message(modelInfo, false); runOnUiThread( @@ -175,16 +182,10 @@ private void onModelRunStarted() { view -> { mModule.stop(); }); - - mRunStartTime = System.currentTimeMillis(); } private void onModelRunStopped() { setTitle(memoryInfo()); - long runDuration = System.currentTimeMillis() - mRunStartTime; - if (mResultMessage != null) { - mResultMessage.setTokensPerSecond(1.0f * mNumTokens / (runDuration / 1000.0f)); - } mSendButton.setText("Generate"); mSendButton.setOnClickListener( view -> { @@ -219,8 +220,6 @@ public void run() { }; new Thread(runnable).start(); }); - mNumTokens = 0; - mRunStartTime = 0; mMessageAdapter.notifyDataSetChanged(); } diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 470d0ca300..80ab3c34b0 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -17,6 +17,21 @@ 0324D6922BAACB6900DEF36F /* ResourceManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0324D6872BAACB6900DEF36F /* ResourceManager.swift */; }; 0324D6932BAACB6900DEF36F /* ResourceMonitor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0324D6882BAACB6900DEF36F /* ResourceMonitor.swift */; 
}; 0324D6962BAACB7000DEF36F /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0324D6942BAACB7000DEF36F /* Assets.xcassets */; }; + 03312C192BBFC940002106EF /* coreml_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C182BBFC940002106EF /* coreml_backend */; }; + 03312C1B2BBFC940002106EF /* coreml_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C1A2BBFC940002106EF /* coreml_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C1D2BBFC940002106EF /* custom_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C1C2BBFC940002106EF /* custom_backend */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C1F2BBFC940002106EF /* custom_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C1E2BBFC940002106EF /* custom_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C252BBFC940002106EF /* mps_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C242BBFC940002106EF /* mps_backend */; }; + 03312C272BBFC940002106EF /* mps_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C262BBFC940002106EF /* mps_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C292BBFC940002106EF /* optimized_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C282BBFC940002106EF /* optimized_backend */; }; + 03312C2B2BBFC940002106EF /* optimized_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C2A2BBFC940002106EF /* optimized_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C2D2BBFC940002106EF /* portable_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C2C2BBFC940002106EF /* portable_backend */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C2F2BBFC940002106EF /* portable_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C2E2BBFC940002106EF /* portable_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 
03312C312BBFC940002106EF /* quantized_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C302BBFC940002106EF /* quantized_backend */; }; + 03312C332BBFC940002106EF /* quantized_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C322BBFC940002106EF /* quantized_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C352BBFC940002106EF /* xnnpack_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C342BBFC940002106EF /* xnnpack_backend */; }; + 03312C372BBFC940002106EF /* xnnpack_backend_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C362BBFC940002106EF /* xnnpack_backend_debug */; settings = {ATTRIBUTES = (Required, ); }; }; + 03312C3E2BBFD076002106EF /* executorch_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03312C3D2BBFD076002106EF /* executorch_debug */; }; 03729EDB2BB1F8DE00152F2E /* LLaMARunner.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */; }; 03729EDC2BB1F8DE00152F2E /* LLaMARunner.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; 03729EE12BB1F93800152F2E /* LLaMARunner.mm in Sources */ = {isa = PBXBuildFile; fileRef = 0324D69A2BAACB7C00DEF36F /* LLaMARunner.mm */; }; @@ -28,14 +43,6 @@ 03729F132BB2042B00152F2E /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F112BB2042B00152F2E /* sampler.cpp */; }; 03729F162BB2043600152F2E /* tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F142BB2043600152F2E /* tokenizer.cpp */; }; 03729F172BB2043600152F2E /* tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F152BB2043600152F2E /* tokenizer.h */; }; - 03FF63422BBB19A40059C911 /* coreml_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF63412BBB19A40059C911 /* coreml_backend */; }; - 03FF63442BBB19A40059C911 /* 
custom_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF63432BBB19A40059C911 /* custom_backend */; }; - 03FF63462BBB19A40059C911 /* executorch in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF63452BBB19A40059C911 /* executorch */; }; - 03FF63482BBB19A40059C911 /* mps_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF63472BBB19A40059C911 /* mps_backend */; }; - 03FF634A2BBB19A40059C911 /* optimized_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF63492BBB19A40059C911 /* optimized_backend */; }; - 03FF634C2BBB19A40059C911 /* portable_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF634B2BBB19A40059C911 /* portable_backend */; }; - 03FF634E2BBB19A40059C911 /* quantized_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF634D2BBB19A40059C911 /* quantized_backend */; }; - 03FF63502BBB19A40059C911 /* xnnpack_backend in Frameworks */ = {isa = PBXBuildFile; productRef = 03FF634F2BBB19A40059C911 /* xnnpack_backend */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -93,14 +100,21 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + 03312C272BBFC940002106EF /* mps_backend_debug in Frameworks */, + 03312C312BBFC940002106EF /* quantized_backend in Frameworks */, + 03312C252BBFC940002106EF /* mps_backend in Frameworks */, + 03312C2D2BBFC940002106EF /* portable_backend in Frameworks */, + 03312C1D2BBFC940002106EF /* custom_backend in Frameworks */, + 03312C352BBFC940002106EF /* xnnpack_backend in Frameworks */, + 03312C372BBFC940002106EF /* xnnpack_backend_debug in Frameworks */, + 03312C1B2BBFC940002106EF /* coreml_backend_debug in Frameworks */, 03729EDB2BB1F8DE00152F2E /* LLaMARunner.framework in Frameworks */, - 03FF63502BBB19A40059C911 /* xnnpack_backend in Frameworks */, - 03FF634A2BBB19A40059C911 /* optimized_backend in Frameworks */, - 03FF634E2BBB19A40059C911 /* quantized_backend in Frameworks */, - 03FF634C2BBB19A40059C911 /* 
portable_backend in Frameworks */, - 03FF63442BBB19A40059C911 /* custom_backend in Frameworks */, - 03FF63482BBB19A40059C911 /* mps_backend in Frameworks */, - 03FF63422BBB19A40059C911 /* coreml_backend in Frameworks */, + 03312C2B2BBFC940002106EF /* optimized_backend_debug in Frameworks */, + 03312C2F2BBFC940002106EF /* portable_backend_debug in Frameworks */, + 03312C292BBFC940002106EF /* optimized_backend in Frameworks */, + 03312C192BBFC940002106EF /* coreml_backend in Frameworks */, + 03312C332BBFC940002106EF /* quantized_backend_debug in Frameworks */, + 03312C1F2BBFC940002106EF /* custom_backend_debug in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -108,7 +122,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 03FF63462BBB19A40059C911 /* executorch in Frameworks */, + 03312C3E2BBFD076002106EF /* executorch_debug in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -269,13 +283,20 @@ ); name = LLaMA; packageProductDependencies = ( - 03FF63412BBB19A40059C911 /* coreml_backend */, - 03FF63432BBB19A40059C911 /* custom_backend */, - 03FF63472BBB19A40059C911 /* mps_backend */, - 03FF63492BBB19A40059C911 /* optimized_backend */, - 03FF634B2BBB19A40059C911 /* portable_backend */, - 03FF634D2BBB19A40059C911 /* quantized_backend */, - 03FF634F2BBB19A40059C911 /* xnnpack_backend */, + 03312C182BBFC940002106EF /* coreml_backend */, + 03312C1A2BBFC940002106EF /* coreml_backend_debug */, + 03312C1C2BBFC940002106EF /* custom_backend */, + 03312C1E2BBFC940002106EF /* custom_backend_debug */, + 03312C242BBFC940002106EF /* mps_backend */, + 03312C262BBFC940002106EF /* mps_backend_debug */, + 03312C282BBFC940002106EF /* optimized_backend */, + 03312C2A2BBFC940002106EF /* optimized_backend_debug */, + 03312C2C2BBFC940002106EF /* portable_backend */, + 03312C2E2BBFC940002106EF /* portable_backend_debug */, + 03312C302BBFC940002106EF /* quantized_backend */, + 03312C322BBFC940002106EF /* quantized_backend_debug */, + 
03312C342BBFC940002106EF /* xnnpack_backend */, + 03312C362BBFC940002106EF /* xnnpack_backend_debug */, ); productName = LLaMA; productReference = 036CAF9D2BB1444500D6C2D5 /* LLaMA.app */; @@ -296,7 +317,7 @@ ); name = LLaMARunner; packageProductDependencies = ( - 03FF63452BBB19A40059C911 /* executorch */, + 03312C3D2BBFD076002106EF /* executorch_debug */, ); productName = LLaMARunner; productReference = 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */; @@ -330,7 +351,7 @@ ); mainGroup = 032C01662AC228E5002955E1; packageReferences = ( - 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */, + 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */, ); productRefGroup = 032C01662AC228E5002955E1; projectDirPath = ""; @@ -556,27 +577,27 @@ OTHER_LDFLAGS = ""; "OTHER_LDFLAGS[sdk=iphoneos*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libportable_backend-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libportable_backend-Debug-0.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Debug-0.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Debug-0.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Debug-0.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libmps_backend-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libmps_backend-Debug-0.a", ); "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libportable_backend-Release-1.a", + "$(BUILT_PRODUCTS_DIR)/libportable_backend-Debug-1.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Release-1.a", + "$(BUILT_PRODUCTS_DIR)/libcustom_backend-Debug-1.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Release-1.a", + "$(BUILT_PRODUCTS_DIR)/libxnnpack_backend-Debug-1.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Release-1.a", + 
"$(BUILT_PRODUCTS_DIR)/libcoreml_backend-Debug-1.a", "-force_load", - "$(BUILT_PRODUCTS_DIR)/libmps_backend-Release-1.a", + "$(BUILT_PRODUCTS_DIR)/libmps_backend-Debug-1.a", ); PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.llama; PRODUCT_NAME = "$(PROJECT_NAME)"; @@ -673,11 +694,11 @@ OTHER_LDFLAGS = ""; "OTHER_LDFLAGS[sdk=iphoneos*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libexecutorch-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libexecutorch-Debug-0.a", ); "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libexecutorch-Release-1.a", + "$(BUILT_PRODUCTS_DIR)/libexecutorch-Debug-1.a", ); PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.llama.LLaMARunner; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; @@ -718,11 +739,11 @@ OTHER_LDFLAGS = ""; "OTHER_LDFLAGS[sdk=iphoneos*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libexecutorch-Release-0.a", + "$(BUILT_PRODUCTS_DIR)/libexecutorch-Debug-0.a", ); "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( "-force_load", - "$(BUILT_PRODUCTS_DIR)/libexecutorch-Release-1.a", + "$(BUILT_PRODUCTS_DIR)/libexecutorch-Debug-1.a", ); PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.llama.LLaMARunner; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; @@ -771,7 +792,7 @@ /* End XCConfigurationList section */ /* Begin XCRemoteSwiftPackageReference section */ - 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */ = { + 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */ = { isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { @@ -782,46 +803,81 @@ /* End XCRemoteSwiftPackageReference section */ /* Begin XCSwiftPackageProductDependency section */ - 03FF63412BBB19A40059C911 /* coreml_backend */ = { + 03312C182BBFC940002106EF /* coreml_backend */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 
03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = coreml_backend; }; - 03FF63432BBB19A40059C911 /* custom_backend */ = { + 03312C1A2BBFC940002106EF /* coreml_backend_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = coreml_backend_debug; + }; + 03312C1C2BBFC940002106EF /* custom_backend */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = custom_backend; }; - 03FF63452BBB19A40059C911 /* executorch */ = { + 03312C1E2BBFC940002106EF /* custom_backend_debug */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; - productName = executorch; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = custom_backend_debug; }; - 03FF63472BBB19A40059C911 /* mps_backend */ = { + 03312C242BBFC940002106EF /* mps_backend */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = mps_backend; }; - 03FF63492BBB19A40059C911 /* optimized_backend */ = { + 03312C262BBFC940002106EF /* mps_backend_debug */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = mps_backend_debug; + }; + 03312C282BBFC940002106EF /* optimized_backend */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = optimized_backend; }; - 
03FF634B2BBB19A40059C911 /* portable_backend */ = { + 03312C2A2BBFC940002106EF /* optimized_backend_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = optimized_backend_debug; + }; + 03312C2C2BBFC940002106EF /* portable_backend */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = portable_backend; }; - 03FF634D2BBB19A40059C911 /* quantized_backend */ = { + 03312C2E2BBFC940002106EF /* portable_backend_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = portable_backend_debug; + }; + 03312C302BBFC940002106EF /* quantized_backend */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = quantized_backend; }; - 03FF634F2BBB19A40059C911 /* xnnpack_backend */ = { + 03312C322BBFC940002106EF /* quantized_backend_debug */ = { isa = XCSwiftPackageProductDependency; - package = 03FF63402BBB19A40059C911 /* XCRemoteSwiftPackageReference "executorch" */; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = quantized_backend_debug; + }; + 03312C342BBFC940002106EF /* xnnpack_backend */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = xnnpack_backend; }; + 03312C362BBFC940002106EF /* xnnpack_backend_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = xnnpack_backend_debug; + }; 
+ 03312C3D2BBFD076002106EF /* executorch_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = executorch_debug; + }; /* End XCSwiftPackageProductDependency section */ }; rootObject = 032C01672AC228E5002955E1 /* Project object */; diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 0f9476868d..1db5c9b84e 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -17,10 +17,38 @@ Please note that the models are subject to the [acceptable use policy](https://g # Results -TODO - Will fill in table of results. +Since 7B Llama2 model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. + +## Quantization: +We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html). + +We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Below are the results for two different groupsizes. + +|Llama 2 | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------|-----------------| ---------------------- | --------------- +|Wikitext Perplexity | 9.16 | 10.2 | 10.7 + +Note that groupsize less than 128 was not enabled, since such model were still too large. 
This is because our current efforts have focused on enabling FP32, and support for FP16 is underway. What this implies for model size is that 1) the embedding table is in FP32 and 2) the quantized weight scales are in FP32. + +## Performance + +Performance was measured on Samsung Galaxy S22, S23, S24 and OnePlus 12. Performance is reported in tokens/second. + +|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------| ---------------------- | --------------- +|Galaxy S22 | x | x | +|Galaxy S24 | x | x | +|OnePlus 12 | x | x | +|iPhone 15 Pro | x | x | + # Instructions +## Tested on + +- macOS M1/M2, Linux. +- For Llama7b, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model. + ## Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch 2. Run `examples/models/llama2/install_requirements.sh` to install a few dependencies. @@ -59,29 +87,64 @@ If you want to deploy and run a smaller model for educational purposes. From `ex ``` 4. Create tokenizer.bin. - Build with buck2: ``` python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin ``` -## Step 3: Run on your computer to validate +## Step 3: Evaluate model accuracy + +> Forewarning: Model evaluation without a GPU may take a long time, especially on larger models. -1. Build llama runner. TODO +Using the same arguments from above +``` +python -m examples.models.llama2.eval_llama -c -p -t -d fp32 --max_seq_len --limit +``` -2. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L13). - Build with buck2: +The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` + +## Step 4: Run on your computer to validate + +1. Build executorch with XNNPACK enabled. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
``` - buck2 run examples/models/llama2:main -- --model_path=llama2.pte --tokenizer_path=tokenizer.bin --prompt="Once" + cmake -DBUCK2=/tmp/buck2 \ + -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -Bcmake-out . + + cmake --build cmake-out -j16 --target install --config Release ``` - Build with cmake: TODO -## Step 4: Run benchmark on Android phone +2. Build llama runner. + ``` + cmake -DBUCK2=/tmp/buck2 \ + -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -Bcmake-out/examples/models/llama2 \ + examples/models/llama2 + + cmake --build cmake-out/examples/models/llama2 -j16 --config Release + ``` + +3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40). + ``` + cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= + ``` + +## Step 5: Run benchmark on Android phone 1. Build llama runner binary for Android 2. Run on Android via adb shell -## Step 5: Build iOS and/or Android apps +## Step 6: Build iOS and/or Android apps TODO @@ -94,3 +157,14 @@ This example tries to reuse the Python code, with minimal modifications to make 1. Since ExecuTorch does not support complex Tensor data type, use the customized functions to have rotary embedding with real numbers. Please see [GitHub issue: Support complex data type in ExecuTorch](https://github.com/pytorch/executorch/issues/886). 2. No CUDA. ExecuTorch is focused on Edge use cases where CUDA is not available on most of the edge devices. 3. No dependencies on fairscale. The ColumnParallelLinear, ParallelEmbedding and training are not needed and supported in ExecuTorch. 
+ + +# Clean +To clean your build: +``` +git clean -xfd +pip uninstall executorch +./install_requirements.sh + +rm -rf cmake-out +``` diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index d15c04cb23..c965053170 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -9,8 +9,8 @@ from typing import Optional import lm_eval - import torch + from lm_eval.api.model import LM from lm_eval.evaluator import evaluate from lm_eval.models.huggingface import HFLM as eval_wrapper @@ -33,7 +33,7 @@ class GPTFastEvalWrapper(eval_wrapper): def __init__( self, model: nn.Module, - tokenizer, + tokenizer: SentencePieceProcessor, max_seq_length: Optional[int] = None, ): super().__init__() @@ -97,16 +97,18 @@ def __init__( max_seq_length: Optional[int] = None, ): super().__init__(None, tokenizer, max_seq_length) - self._model = model + self._model = model # Expects model to be path to a .pte file - def _model_call(self, inps): - # Given inps (tokens), return the logits from a single - # forward call + from executorch.extension.pybindings.portable_lib import _load_for_executorch - # Example: - # inps: Tensor of shape (1, N) - # logits: Tensor of shape (1, N, 32000) - pass + self._et_model = _load_for_executorch(self._model) + + def _model_call(self, inps): + # Given inps (tokens), return the logits from a single forward call + # inps: Tensor of shape (1, max_seq_len - 1) + # logits: Tensor of shape (1, max_seq_len - 1, 32000) + result = self._et_model.forward((inps,)) + return result[0] class ETRunnerEvalWrapper(GPTFastEvalWrapper): @@ -198,7 +200,9 @@ def gen_eval_wrapper( return ETEagerEvalWrapper( model=model, tokenizer=tokenizer, - max_seq_length=args.max_seq_length, + # Exported model takes at most (max_seq_length - 1) tokens. + # Note that the eager model takes at most max_seq_length tokens. 
+ max_seq_length=args.max_seq_length - 1, ) # GPTFastEvalWrapper: Create a wrapper around a pre-exported model diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 1e5038f1cb..de1e711a2c 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -134,7 +134,7 @@ def quantize( from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer model = Int8DynActInt4WeightQuantizer( - precision=torch_dtype, group_size=group_size + precision=torch_dtype, groupsize=group_size ).quantize(model) if verbose_export(): print("quantized model:", model) @@ -153,6 +153,7 @@ def quantize( if calibration_tasks is None: calibration_tasks = ["wikitext"] + from torchao.quantization.GPTQ import InputRecorder from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer if tokenizer_path is None: @@ -161,17 +162,28 @@ def quantize( tokenizer = SentencePieceProcessor( # pyre-ignore[28] model_file=str(tokenizer_path) ) + + inputs = ( + InputRecorder( + tokenizer, + calibration_seq_length, + None, # input_prep_func + pad_calibration_inputs, + model.vocab_size, + ) + .record_inputs( + calibration_tasks, + calibration_limit, + ) + .get_inputs() + ) + gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer( - tokenizer, blocksize, percdamp, group_size, - calibration_tasks, - calibration_limit, - calibration_seq_length, - pad_calibration_inputs, ) - model = gptq_quantizer.quantize(model) + model = gptq_quantizer.quantize(model, inputs) return model else: raise Exception(f"Unrecognized quantize mode: {qmode}") @@ -593,7 +605,10 @@ def _export_llama(modelname, args) -> str: # noqa: C901 partitioners.append( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple` CoreMLPartitioner( - skip_ops_for_coreml_delegation=None, compile_specs=compile_specs + skip_ops_for_coreml_delegation=[ + "aten.index_put.default", + ], + 
compile_specs=compile_specs, ) ) modelname = f"coreml_{modelname}" diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 2808aa3c9b..af7c25ec67 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -29,6 +29,8 @@ namespace torch::executor { namespace { static constexpr auto kTopp = 0.9f; +void printReport(const Runner::Stats& stats); +std::string statsToJsonString(const Runner::Stats& stats); } // namespace Runner::Runner( @@ -208,20 +210,21 @@ Result Runner::run_model_step( Error Runner::generate( const std::string& prompt, int32_t seq_len, - std::function callback) { + std::function token_callback, + std::function stats_callback) { // Prepare the inputs. // Use ones-initialized inputs. ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); if (!is_loaded()) { - timers_.model_load_start_ms = util::time_in_ms(); + stats_.model_load_start_ms = util::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - timers_.model_load_end_ms = util::time_in_ms(); + stats_.model_load_end_ms = util::time_in_ms(); } // First token time only measures the time it takes to encode the prompt and // return a response token. 
- timers_.inference_start_ms = util::time_in_ms(); + stats_.inference_start_ms = util::time_in_ms(); shouldStop_ = false; // encode the (string) prompt into tokens sequence @@ -319,9 +322,9 @@ Error Runner::generate( run_model_step(cur_token, tokens_managed, start_pos_managed, seq_len); if (pos == num_prompt_tokens) { - timers_.first_token_ms = util::time_in_ms(); + stats_.first_token_ms = util::time_in_ms(); } else if (pos == num_prompt_tokens - 1) { - timers_.prompt_eval_end_ms = util::time_in_ms(); + stats_.prompt_eval_end_ms = util::time_in_ms(); } ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); @@ -345,7 +348,7 @@ Error Runner::generate( "Unsupported dtype output %hhd", static_cast(logits_tensor.scalar_type())); } - timers_.aggregate_sampling_time_ms += + stats_.aggregate_sampling_time_ms += util::time_in_ms() - sample_start_time_ms; // advance the state machine @@ -364,8 +367,8 @@ Error Runner::generate( util::safe_printf(piece); fflush(stdout); - if (callback) { - callback(piece); + if (token_callback) { + token_callback(piece); } if (shouldStop_) { @@ -379,93 +382,102 @@ Error Runner::generate( break; } } - timers_.inference_end_ms = util::time_in_ms(); + stats_.inference_end_ms = util::time_in_ms(); printf("\n"); if (pos == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); } - timers_.printReport(num_prompt_tokens, pos - num_prompt_tokens); + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = pos - num_prompt_tokens; + printReport(stats_); + if (stats_callback) { + stats_callback(stats_); + } delete[] prompt_tokens; return Error::Ok; } -void Runner::TimeStamps::printReport( - const int64_t& num_prompt_tokens, - const int64_t& num_generated_tokens) { - printf( - "PyTorchObserver %s\n", - toJsonString(num_prompt_tokens, num_generated_tokens).c_str()); +namespace { +void printReport(const Runner::Stats& stats) { + printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); ET_LOG( Info, "\tPrompt 
Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, - num_prompt_tokens, - num_generated_tokens); + stats.num_prompt_tokens, + stats.num_generated_tokens); ET_LOG( Info, "\tModel Load Time:\t\t%f (seconds)", - ((double)(model_load_end_ms - model_load_start_ms) / - SCALING_FACTOR_UNITS_PER_SECOND)); - double inference_time_ms = (double)(inference_end_ms - inference_start_ms); + ((double)(stats.model_load_end_ms - stats.model_load_start_ms) / + stats.SCALING_FACTOR_UNITS_PER_SECOND)); + double inference_time_ms = + (double)(stats.inference_end_ms - stats.inference_start_ms); ET_LOG( Info, "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - inference_time_ms / SCALING_FACTOR_UNITS_PER_SECOND, + inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, - (num_generated_tokens) / (double)(inference_end_ms - inference_start_ms) * - SCALING_FACTOR_UNITS_PER_SECOND); - double prompt_eval_time = (double)(prompt_eval_end_ms - inference_start_ms); + (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND); + double prompt_eval_time = + (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); ET_LOG( Info, "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - prompt_eval_time / SCALING_FACTOR_UNITS_PER_SECOND, - (num_prompt_tokens) / prompt_eval_time * SCALING_FACTOR_UNITS_PER_SECOND); + prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, + (stats.num_prompt_tokens) / prompt_eval_time * + stats.SCALING_FACTOR_UNITS_PER_SECOND); - double eval_time = (double)(inference_end_ms - prompt_eval_end_ms); + double eval_time = + (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); ET_LOG( Info, "\t\tGenerated %" PRIu64 " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - num_generated_tokens, - eval_time / SCALING_FACTOR_UNITS_PER_SECOND, - num_generated_tokens / eval_time * SCALING_FACTOR_UNITS_PER_SECOND); + stats.num_generated_tokens, + 
eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, + stats.num_generated_tokens / eval_time * + stats.SCALING_FACTOR_UNITS_PER_SECOND); // Time to first token is measured from the start of inference, excluding // model load time. ET_LOG( Info, "\tTime to first generated token:\t%f (seconds)", - ((double)(first_token_ms - inference_start_ms) / - SCALING_FACTOR_UNITS_PER_SECOND)); + ((double)(stats.first_token_ms - stats.inference_start_ms) / + stats.SCALING_FACTOR_UNITS_PER_SECOND)); ET_LOG( Info, "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - num_prompt_tokens + num_generated_tokens, - (double)aggregate_sampling_time_ms / SCALING_FACTOR_UNITS_PER_SECOND); + stats.num_prompt_tokens + stats.num_generated_tokens, + (double)stats.aggregate_sampling_time_ms / + stats.SCALING_FACTOR_UNITS_PER_SECOND); } -const std::string Runner::TimeStamps::toJsonString( - const int64_t& num_prompt_tokens, - const int64_t& num_generated_tokens) { +std::string statsToJsonString(const Runner::Stats& stats) { std::stringstream ss; - ss << "{\"prompt_tokens\":" << num_prompt_tokens << "," - << "\"generated_tokens\":" << num_generated_tokens << "," - << "\"model_load_start_ms\":" << model_load_start_ms << "," - << "\"model_load_end_ms\":" << model_load_end_ms << "," - << "\"inference_start_ms\":" << inference_start_ms << "," - << "\"inference_end_ms\":" << inference_end_ms << "," - << "\"prompt_eval_end_ms\":" << prompt_eval_end_ms << "," - << "\"first_token_ms\":" << first_token_ms << "," - << "\"aggregate_sampling_time_ms\":" << aggregate_sampling_time_ms << "," + ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << "," + << "\"generated_tokens\":" << stats.num_generated_tokens << "," + << "\"model_load_start_ms\":" << stats.model_load_start_ms << "," + << "\"model_load_end_ms\":" << stats.model_load_end_ms << "," + << "\"inference_start_ms\":" << stats.inference_start_ms << "," + << "\"inference_end_ms\":" << stats.inference_end_ms << "," + << "\"prompt_eval_end_ms\":" 
<< stats.prompt_eval_end_ms << "," + << "\"first_token_ms\":" << stats.first_token_ms << "," + << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms + << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" - << SCALING_FACTOR_UNITS_PER_SECOND << "}"; + << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; return ss.str(); } +} // namespace void Runner::stop() { shouldStop_ = true; diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 34339a7c03..08f5e33c47 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -31,12 +31,39 @@ class Runner { const std::string& tokenizer_path, const float temperature = 0.8f); + struct Stats { + // Scaling factor for timestamps - in this case, we use ms. + const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; + // Time stamps for the different stages of the execution + // model_load_start_ms: Start of model loading. + long model_load_start_ms; + // model_load_end_ms: End of model loading. + long model_load_end_ms; + // inference_start_ms: Immediately after the model is loaded (or we check + // for model load), measure the inference time. + long inference_start_ms; + // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right + // before the inference loop starts + long prompt_eval_end_ms; + // first_token: Timestamp when the first generated token is emitted + long first_token_ms; + // inference_end_ms: End of inference/generation. + long inference_end_ms; + // Keep a running total of the time spent in sampling. 
+ long aggregate_sampling_time_ms; + // Token count from prompt + int64_t num_prompt_tokens; + // Token count from generated (total - prompt) + int64_t num_generated_tokens; + }; + bool is_loaded() const; Error load(); Error generate( const std::string& prompt, int32_t seq_len = 128, - std::function callback = {}); + std::function token_callback = {}, + std::function stats_callback = {}); void stop(); private: @@ -68,36 +95,7 @@ class Runner { std::unique_ptr tokenizer_; std::unique_ptr sampler_; bool shouldStop_{false}; - - struct TimeStamps { - // Scaling factor for timestamps - in this case, we use ms. - const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; - // Time stamps for the different stages of the execution - // model_load_start_ms: Start of model loading. - long model_load_start_ms; - // model_load_end_ms: End of model loading. - long model_load_end_ms; - // inference_start_ms: Immediately after the model is loaded (or we check - // for model load), measure the inference time. - long inference_start_ms; - // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right - // before the inference loop starts - long prompt_eval_end_ms; - // first_token: Timestamp when the first generated token is emitted - long first_token_ms; - // inference_end_ms: End of inference/generation. - long inference_end_ms; - // Keep a running total of the time spent in sampling. 
- long aggregate_sampling_time_ms; - - void printReport( - const int64_t& num_prompt_tokens, - const int64_t& num_generated_tokens); - const std::string toJsonString( - const int64_t& num_prompt_tokens, - const int64_t& num_generated_tokens); - }; - TimeStamps timers_; + Stats stats_; }; } // namespace torch::executor diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 15658f8d75..7f91b4a67f 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -4,7 +4,7 @@ def _get_operator_lib(aten = False): if aten: return ["//executorch/kernels/aten:generated_lib_aten"] elif runtime.is_oss: - return ["//executorch/kernels/portable:generated_lib"] + return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops"] else: return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"] diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index dbad50e1cd..799a1dbe78 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -1503,7 +1503,7 @@ def deserialize_optional_tensor_args(a): if a.type == "as_none": return None elif a.type == "as_tensor": - return self.serialized_name_to_node[a.value] + return self.serialized_name_to_node[a.value.name] else: raise SerializeError(f"Unhandled argument {inp}") diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 9790c7c950..b4fe80f022 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -72,6 +72,18 @@ class ExecuTorchLlamaCallbackJni facebook::jni::local_ref s = facebook::jni::make_jstring(result); method(self(), s); } + + void onStats(const Runner::Stats& result) const { + static auto cls = 
ExecuTorchLlamaCallbackJni::javaClassStatic(); + static const auto method = cls->getMethod("onStats"); + double eval_time = + (double)(result.inference_end_ms - result.prompt_eval_end_ms); + + float tps = result.num_generated_tokens / eval_time * + result.SCALING_FACTOR_UNITS_PER_SECOND; + + method(self(), tps); + } }; class ExecuTorchLlamaJni @@ -117,9 +129,10 @@ class ExecuTorchLlamaJni facebook::jni::alias_ref prompt, facebook::jni::alias_ref callback) { runner_->generate( - prompt->toStdString(), 128, [callback](std::string result) { - callback->onResult(result); - }); + prompt->toStdString(), + 128, + [callback](std::string result) { callback->onResult(result); }, + [callback](const Runner::Stats& result) { callback->onStats(result); }); return 0; } diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java index b07f82eedf..33ab928bae 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java @@ -11,7 +11,20 @@ import com.facebook.jni.annotations.DoNotStrip; public interface LlamaCallback { - /** Called when a new result is available from JNI. User should override this method. */ + /** + * Called when a new result is available from JNI. Users will keep getting onResult() invocations + * until generate() finishes. + * + * @param result Last generated token + */ @DoNotStrip public void onResult(String result); + + /** + * Called when the statistics for the generate() is available. + * + * @param tps Tokens/second for generated tokens. 
+ */ + @DoNotStrip + public void onStats(float tps); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index f73f550d5a..d3a4c3232a 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -27,6 +27,7 @@ public class LlamaModule { private static native HybridData initHybrid( String modulePath, String tokenizerPath, float temperature); + /** Constructs a LLAMA Module for a model with given path, tokenizer, and temperature. */ public LlamaModule(String modulePath, String tokenizerPath, float temperature) { mHybridData = initHybrid(modulePath, tokenizerPath, temperature); } @@ -35,12 +36,20 @@ public void resetNative() { mHybridData.resetNative(); } + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param llamaCallback callback object to receive results. + */ @DoNotStrip public native int generate(String prompt, LlamaCallback llamaCallback); + /** Stop current generate() before it finishes. */ @DoNotStrip public native void stop(); + /** Force loading the module. Otherwise the model is loaded during first generate(). */ @DoNotStrip public native int load(); }