dbg: temp
Signed-off-by: YdrMaster <ydrml@hotmail.com>
YdrMaster committed Dec 19, 2023
1 parent cd02cf0 commit 60b8259
Showing 3 changed files with 32 additions and 12 deletions.
18 changes: 13 additions & 5 deletions scripts/compare/compare.py
@@ -23,6 +23,7 @@ def parse_args():
         args.actual,
     )
 
+
 def getDiff(base, test):
     absolute_diff = np.subtract(base, test)
     max_absolute_diff = np.max(np.abs(absolute_diff))
@@ -35,16 +36,19 @@ def getDiff(base, test):
 
     return max_absolute_diff, max_relative_diff
 
-def compare_npy(actual_path, expect_path, edge, node):
+
+def compare_npy(meta_file, actual_path, expect_path, edge, node):
     actual = np.load(actual_path)
     expect = np.load(expect_path)
     if np.isnan(actual).any():
         print(f"NAN value in node:{node} edge:{edge}")
         return
 
     max_absolute_diff, max_relative_diff = getDiff(expect, actual)
-    if max_absolute_diff != 0.0: ## No need to print tensor with no diff
-        print(f'{max_absolute_diff}\t{max_relative_diff}\t{node}\t{edge}')
+    if max_absolute_diff != 0.0:  ## No need to print tensor with no diff
+        print(
+            f"{max_absolute_diff}\t{max_relative_diff}\t{meta_file}\t{actual_path}\t{expect_path}"
+        )
 
 
 def main():
@@ -71,7 +75,11 @@ def main():
                 expect_file_path = os.path.join(expect_dir, expect_file)
                 if os.path.exists(expect_file_path):
                     compare_npy(
-                        actual_file_path, expect_file_path, edge_name, node_name
+                        meta_file,
+                        actual_file_path,
+                        expect_file_path,
+                        edge_name,
+                        node_name,
                     )
 
 
11 changes: 9 additions & 2 deletions src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cu
@@ -86,11 +86,13 @@ namespace refactor::kernel {
 
             QuantizeMinMax<TI> *nullTyped = nullptr;
             size_t tempStorageBytes = 0;
-            cub::DeviceReduce::Reduce(
+            auto e = cub::DeviceReduce::Reduce(
                 nullptr, tempStorageBytes,
                 nullTyped, nullTyped, 0,
                 QuantizeReduceMinMaxFunctor<TI>{},
                 QuantizeMinMax<TI>{});
+            fmt::println("error: {} {}", (int) e, cudaGetErrorString(e));
+            fmt::println("tempStorageBytes: {}", tempStorageBytes);
 
             auto offset0 = workspaceSize;
             workspaceSize += tempStorageBytes;
@@ -117,12 +119,17 @@
                 QuantizeMapMinMaxFunctor<TI>{});
 
             auto tempStorageSize_ = tempStorageBytes;
-            cub::DeviceReduce::Reduce(
+            fmt::println("tempStorage: {}, tempStorageSize: {}, doubled: {}, minmax: {}, params.n: {}",
+                         (void *) tempStorage, tempStorageSize_, (void *) doubled, (void *) minmax, params.n);
+
+            auto e = cub::DeviceReduce::Reduce(
                 tempStorage, tempStorageSize_,
                 doubled, minmax, params.n,
                 QuantizeReduceMinMaxFunctor<TI>{},
                 QuantizeMinMax<TI>{_MAX, _MIN});
 
+            fmt::println("error: {} {}", (int) e, cudaGetErrorString(e));
+
             kernel<<<params.gridSize, params.blockSize>>>(
                 params.n, minmax, x, y, scale, zp);
         };
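For context on the two Reduce calls above: cub::DeviceReduce::Reduce is a two-phase API. Called with a null workspace pointer, it does no work; it only writes the required temporary-storage size and returns a cudaError_t. Called again with a real workspace of at least that size, it performs the reduction. Below is a minimal, self-contained sketch of the pattern; the names MinOp, dIn, and dOut are illustrative and not from this repository.

// minimal_cub_reduce.cu -- illustrative sketch only; names are not from this repository.
#include <cub/cub.cuh>
#include <cfloat>
#include <cstdio>

// Custom reduction operator, analogous in role to QuantizeReduceMinMaxFunctor.
struct MinOp {
    __host__ __device__ float operator()(float a, float b) const {
        return a < b ? a : b;
    }
};

int main() {
    int const n = 4096;
    float *dIn, *dOut;
    cudaMalloc(&dIn, n * sizeof(float));
    cudaMalloc(&dOut, sizeof(float));
    cudaMemset(dIn, 0, n * sizeof(float));

    // Phase 1: with a null workspace pointer, Reduce launches nothing; it only
    // reports the required workspace size and returns a cudaError_t.
    void *workspace = nullptr;
    size_t workspaceBytes = 0;
    auto e = cub::DeviceReduce::Reduce(workspace, workspaceBytes,
                                       dIn, dOut, n, MinOp{}, FLT_MAX);
    std::printf("size query: %s, bytes = %zu\n", cudaGetErrorString(e), workspaceBytes);

    // Phase 2: the same call with a real workspace of at least workspaceBytes
    // performs the actual reduction into dOut.
    cudaMalloc(&workspace, workspaceBytes);
    e = cub::DeviceReduce::Reduce(workspace, workspaceBytes,
                                  dIn, dOut, n, MinOp{}, FLT_MAX);
    std::printf("reduce: %s\n", cudaGetErrorString(e));

    cudaFree(workspace);
    cudaFree(dOut);
    cudaFree(dIn);
    return 0;
}

The documented usage is to pass the same arguments, including the item count, to both phases, since the reported workspace size depends on them.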
15 changes: 10 additions & 5 deletions src/04kernel/test/kernels/dynamic_quantize_linear/test_cuda.cpp
@@ -10,7 +10,7 @@ using namespace kernel;
 using namespace hardware;
 
 TEST(kernel, DynamicQuantizeLinearCuda) {
-    auto size = 20;
+    auto size = 4096;
     // build routine
     auto kernel = DynamicQuantizeLinearCuda::build(size),
          kCpu = DynamicQuantizeLinearCpu::build(size);
@@ -26,13 +26,12 @@ TEST(kernel, DynamicQuantizeLinearCuda) {
          zpGpu = dev.malloc(sizeof(uint8_t)),
          workspace = dev.malloc(workspaceSize);
     // put input data
-    std::vector<float> x(size);
+    std::vector<float> x(size, 1);
     std::vector<uint8_t> y(size);
     float scale;
     uint8_t zeroPoint;
-    for (auto i : range0_(size)) {
-        x[i] = i * 3 + 15;
-    }
+    x[0] = 1.7181609;
+    x[1] = 0.00011457229;
     xGpu->copyFromHost(x.data());
     // inference
     {
@@ -50,6 +49,12 @@
         std::vector<uint8_t> result(size);
         yGpu->copyToHost(result.data());
         EXPECT_EQ(result, y);
+        float scale_;
+        scaleGpu->copyToHost(&scale_);
+        EXPECT_EQ(scale_, scale);
+        uint8_t zp_;
+        zpGpu->copyToHost(&zp_);
+        EXPECT_EQ(zp_, zeroPoint);
     }
 }
 
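The added checks copy the computed scale and zero point back from the GPU and compare them with the CPU kernel's outputs. For reference, a host-side sketch of the computation the ONNX DynamicQuantizeLinear operator defines for uint8 output; this is a sketch under that assumption, and dynamicQuantizeLinearRef with its parameter names is illustrative rather than the repository's CPU kernel.

// dynamic_quantize_reference.cpp -- host-side sketch of the ONNX
// DynamicQuantizeLinear formula (uint8); illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void dynamicQuantizeLinearRef(std::vector<float> const &x,
                              std::vector<uint8_t> &y,
                              float &scale, uint8_t &zeroPoint) {
    // The quantization range is stretched to include 0 so that 0.f maps
    // exactly onto an integer code; qmin = 0, qmax = 255 for uint8.
    // (The degenerate all-zero input, where scale would be 0, is not handled.)
    float xMin = std::min(0.0f, *std::min_element(x.begin(), x.end()));
    float xMax = std::max(0.0f, *std::max_element(x.begin(), x.end()));
    scale = (xMax - xMin) / 255.0f;
    // Zero point: the integer code that represents the real value 0.
    float zp = std::nearbyint(-xMin / scale);// rounds half to even by default
    zeroPoint = static_cast<uint8_t>(std::clamp(zp, 0.0f, 255.0f));
    // Quantize each element: round(x / scale) + zeroPoint, saturated to uint8.
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        float q = std::nearbyint(x[i] / scale) + static_cast<float>(zeroPoint);
        y[i] = static_cast<uint8_t>(std::clamp(q, 0.0f, 255.0f));
    }
}

Under this definition, the test input above (all ones except x[0] = 1.7181609 and x[1] = 0.00011457229) has its minimum clamped to 0, so the expected zero point is 0 and the scale is 1.7181609 / 255.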
