From ea79d5fb47eb274e2f8a73fd2ddb3303416a8542 Mon Sep 17 00:00:00 2001 From: Dmitry Zolotukhin Date: Sun, 28 Jan 2024 20:46:01 +0100 Subject: [PATCH] Improve device scoring. Improve searching for the right buffer. Keep trying all heaps when attempting to allocate memory. Fixed incorrect type in shader. Fixed buffer allocation errors. Added a few more tests for sending the matrix and vectors. Switched how the matrix is represented in memory. --- .../shaders/cross_check_filter.comp.glsl | 1 + .../shaders/cross_check_filter.spv | Bin 1604 -> 1796 bytes .../shaders/init_out_data.comp.glsl | 3 +- src/correlation/shaders/init_out_data.spv | Bin 2060 -> 2308 bytes src/correlation/vk.rs | 140 ++++++++---------- 5 files changed, 64 insertions(+), 80 deletions(-) diff --git a/src/correlation/shaders/cross_check_filter.comp.glsl b/src/correlation/shaders/cross_check_filter.comp.glsl index 8f8fca5..4eb2803 100644 --- a/src/correlation/shaders/cross_check_filter.comp.glsl +++ b/src/correlation/shaders/cross_check_filter.comp.glsl @@ -38,4 +38,5 @@ void main() { // TODO: remove this debug code img1[0] = ivec2(img1_width, img1_height); + img1[1] = ivec2(img2_width, img2_height); } diff --git a/src/correlation/shaders/cross_check_filter.spv b/src/correlation/shaders/cross_check_filter.spv index d5c137f74d55c2e7160e40a703e75f0d46aded62..a969a8d11c54888b14cd24745d24a0bb7d0ec18d 100644 GIT binary patch delta 428 zcmYk2O-=$q5QX2&AYu?Bi6%$_86_-K@DCMa#DoHDLkGE`u$l7>Wo4VY`^Uw1aHG*h_>^PqMt^=n-}gQi%G7+^t&tJ<2h(!xh#!bt9r{JX#U7uZ?{?{umCS z$1;1wB6#-e((1P~NtRRg@oas_!NW{w=mu9{e-Q?~4qmLeMd#pwW6d48{a7;~xA)3D fUgRF!$Np61b6t=;zzH;=3omSI&xhAM;|Tr$2{jr! delta 229 zcmZqSJHo@u%%sfDz`)4B&A`E6w2?QSDN%=kfq@N(Ie^%MfrUXANP~a`5PLAN!uiTz zQ3eKg24=7rGm!5NRId-Dm4KKLiLZ#nH{5)d=`v%aEKnQBDp4SogBk(iD*%nq1JX=D t46@o9$_E(>64M6qLC!M(VgVr50rEJ3v>{Z?V6!BvGb6`ep#2s=3;+Y<4>|w< diff --git a/src/correlation/shaders/init_out_data.comp.glsl b/src/correlation/shaders/init_out_data.comp.glsl index 138da8d..911898a 100644 --- a/src/correlation/shaders/init_out_data.comp.glsl +++ b/src/correlation/shaders/init_out_data.comp.glsl @@ -45,7 +45,7 @@ layout(std430, set = 0, binding = 3) buffer Internals_Int { // Layout: // Contains [min, max, neighbor_count] for the corridor range - int internals_int[]; + ivec3 internals_int[]; }; layout(std430, set = 0, binding = 4) buffer Result_Matches { @@ -69,4 +69,5 @@ void main() { // TODO: remove this debug code result_corr[0] = threshold; + result_corr[1] = (fundamental_matrix * vec3(10.0, 5.0, 2.0)).x; } diff --git a/src/correlation/shaders/init_out_data.spv b/src/correlation/shaders/init_out_data.spv index 0ed0416a34ffe52ee78998cba345f6126abb6b9d..e8ee87b814ffce6583b16927954b8315626f6eb0 100644 GIT binary patch literal 2308 zcmZwHM{iXz5Cvc-FQK>4Yj`2_K^Xp5QD4WZM^1O`4OqmQ*!maE4^6IPA#c$u&7O!5rC}LX~bb(^F zCuTZ+D)#yP$1fH($99Cr&2)3vE~PAowayOrcu)MgnD~{b?Tt6)$1XX8ji(y!J0%Gn=R_YpVQa~tAooPES?X*fAJ_o#99!|iT3_rN)0jmrn_IQL!2 z?)J4`dtO-HFjn($a@##O28(+Ut8trp?sdcYzT)0AocrM3Hr&>pd)IK=dTy=Zw)fnJ zhI1}@pBm14tqm1RM2RRV?3-j~jIJsd6p8eRtTq8}zTo|I?o? zH@cPadB!n=j;ngNLbiG^J#|9wc6_UMC)jAuhpT$`GH>-@dTNZ`gLr!NJx7@vt==fO z>OIW7)r0BP^~aghQzvTSNxaeO;cEPH=B*w~Pi;Hjvv_rLAhs)~o{XPMwtVwFiLu`> zW(w~et@*X!JQ zXT$qGbe$JI+H>`M!(iV={9M0A13n(_Zg7o1nR$D^Q^D#){ONf6IivV-UO*WxReTOC zKSvdx?{FEN%Z+@C#n?hjoHg&EziD|7eGA*~YP|E|+z-~fIC=WE=IL2j-pzP1IC=1` zo|C6PYo0!Y<=yRLaPnZiNXz@)kM}m-la9W`Irk_Yrn3|;XB49YKaAn%Jc_3S)6tJO zI{FT#^E6%zjt;E%aK5h^S&8{C_`creS!z5@ig_I_vI_52QMW` z?f9?jJ^gmC-`9T~e?N-y;Tpd)bMf}C@w+m|i>D8ZpNl^hQ@6vIcyBWw!!PAt>hwfR zFX}f&2Pcls>G(4-I;UcE)alt6{zqce--Vu2hcCm~>tfHx{oU_U=5;?kVLzPM-(?~- HHBtTmW0F(* delta 683 zcmYk2Jx;?w6ojAGPGW>CSqKRzKtQDM7YL-_Cl0mKQc_XFCEQRUI-h_eAdbLsI0F*% zIlEz3dUH_wU)6zw>@gX9yV#&o#$}UVtcr-KhqrBgRv)1o;&|4vhSqXm0%=lQ?;j= zn8Ev6IacR2aFxfz3WtsegN~%Y=*H^oz@Y<^R?v}@*G-K`AP`>qB{+m&VhO^j=AHJT zKY8BL?KLR6B_TLIgCxMX-l^jNhbx#gmQT#OR4d_9H5OV^pR3@i_G9pq|5_q-Vt?iJ BB%A;M diff --git a/src/correlation/vk.rs b/src/correlation/vk.rs index ed81ce7..6aee1ec 100644 --- a/src/correlation/vk.rs +++ b/src/correlation/vk.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, error, ffi::CStr, fmt, slice, time::SystemTime}; +use std::{cmp::Ordering, collections::HashMap, error, ffi::CStr, fmt, slice, time::SystemTime}; use ash::{prelude::VkResult, vk}; use nalgebra::Matrix3; @@ -33,7 +33,7 @@ struct ShaderParams { out_height: u32, scale: f32, iteration_pass: u32, - fundamental_matrix: [f32; 3 * 4], // matrices are column-major and each column is aligned to 4-component vectors; should be aligned to 16 bytes + fundamental_matrix: [f32; 3 * 4], // matrices are stored row-by-row and each row is aligned to 4-component vectors; should be aligned to 16 bytes corridor_offset: i32, corridor_start: u32, corridor_end: u32, @@ -169,6 +169,9 @@ impl GpuContext { } let correlation_values = Grid::new(img1_dimensions.0, img1_dimensions.1, None); + // TODO: remove this debug code + let mut fundamental_matrix = Matrix3::zeros(); + fundamental_matrix[(2, 0)] = 9.8765; let params = CorrelationParameters::for_projection(&projection_mode); let result = GpuContext { min_stdev: params.min_stdev, @@ -426,7 +429,7 @@ impl GpuContext { let mut f = [0f32; 3 * 4]; for row in 0..3 { for col in 0..3 { - f[col * 4 + row] = fundamental_matrix[(row, col)] as f32; + f[row * 4 + col] = fundamental_matrix[(row, col)] as f32; } } f @@ -813,7 +816,11 @@ impl Device { }); } // TODO: remove this debug code - println!("Corr check = {:?}", out_image.val(0, 0)); + println!( + "Corr check = {:?} {:?}", + out_image.val(0, 0), + out_image.val(1, 0) + ); if !buffer.host_coherent { let flush_memory_ranges = vk::MappedMemoryRange::builder() @@ -906,7 +913,7 @@ impl Device { unsafe fn find_device( instance: &ash::Instance, max_buffer_size: usize, - ) -> Result<(vk::PhysicalDevice, &'static str, u32), Box> { + ) -> Result<(vk::PhysicalDevice, String, u32), Box> { let devices = instance.enumerate_physical_devices()?; let device = devices .iter() @@ -923,16 +930,7 @@ impl Device { let queue_index = Device::find_compute_queue(instance, device)?; let device_name = CStr::from_ptr(props.device_name.as_ptr()); - let device_name = device_name.to_str().unwrap(); - println!( - "Device {} type {} {}-{}-{}-{}", - device_name, - props.device_type.as_raw(), - props.limits.max_push_constants_size, - props.limits.max_bound_descriptor_sets, - props.limits.max_storage_buffer_range, - max_buffer_size - ); + let device_name = String::from_utf8_lossy(device_name.to_bytes()).to_string(); // TODO: allow to specify a device name filter/regex? let score = match props.device_type { vk::PhysicalDeviceType::DISCRETE_GPU => 3, @@ -941,24 +939,25 @@ impl Device { _ => 0, }; // Prefer real devices instead of dzn emulation. - let dzn_multiplier = if device_name + let is_dzn = device_name .to_lowercase() - .starts_with("microsoft direct3d12") - { - 1 - } else { - 10 - }; - Some((device, device_name, queue_index, score * dzn_multiplier)) + .starts_with("microsoft direct3d12"); + let score = (score, is_dzn); + Some((device, device_name, queue_index, score)) }) - .max_by_key(|(_device, _name, _queue_index, score)| *score); - let (device, name, queue_index) = if let Some((device, name, queue_index, _score)) = device - { + .max_by(|(_, _, _, a), (_, _, _, b)| { + if a.1 && !b.1 { + return Ordering::Less; + } else if !a.1 && b.1 { + return Ordering::Greater; + } + return a.0.cmp(&b.0); + }); + let (device, name, queue_index) = if let Some((device, name, queue_index, score)) = device { (device, name, queue_index) } else { return Err(GpuError::new("Device not found").into()); }; - println!("selected device {}", name); Ok((device, name, queue_index)) } @@ -1095,13 +1094,13 @@ impl Device { buffer_type: BufferType, ) -> Result> { let size = size as u64; - let gpu_local = match buffer_type { - BufferType::GpuOnly | BufferType::GpuDestination | BufferType::GpuSource => true, - BufferType::HostSource | BufferType::HostDestination => false, - }; - let host_visible = match buffer_type { - BufferType::HostSource | BufferType::HostDestination => true, - BufferType::GpuOnly | BufferType::GpuDestination | BufferType::GpuSource => false, + let required_memory_properties = match buffer_type { + BufferType::GpuOnly | BufferType::GpuDestination | BufferType::GpuSource => { + vk::MemoryPropertyFlags::DEVICE_LOCAL + } + BufferType::HostSource | BufferType::HostDestination => { + vk::MemoryPropertyFlags::HOST_VISIBLE + } }; let extra_usage_flags = match buffer_type { BufferType::HostSource => vk::BufferUsageFlags::TRANSFER_SRC, @@ -1122,58 +1121,41 @@ impl Device { }; let buffer = device.create_buffer(&buffer_create_info, None)?; let memory_requirements = device.get_buffer_memory_requirements(buffer); - let memory_type_index = memory_properties.memory_types - [..memory_properties.memory_type_count as usize] - .iter() - .enumerate() - .find(|(memory_type_index, memory_type)| { + let buffer_memory = (0..memory_properties.memory_type_count as usize) + .flat_map(|i| { + let memory_type = memory_properties.memory_types[i]; if memory_properties.memory_heaps[memory_type.heap_index as usize].size < memory_requirements.size { - return false; - }; - if (1 << memory_type_index) & memory_requirements.memory_type_bits == 0 { - return false; + return None; } - - if gpu_local - && memory_type - .property_flags - .contains(vk::MemoryPropertyFlags::DEVICE_LOCAL) - { - return true; + if ((1 << i) & memory_requirements.memory_type_bits) == 0 { + return None; } - if host_visible - && memory_type - .property_flags - .contains(vk::MemoryPropertyFlags::HOST_VISIBLE) - { - return true; + let property_flags = memory_type.property_flags; + if !property_flags.contains(required_memory_properties) { + return None; } - false - }); - let memory_type_index = if let Some((index, _)) = memory_type_index { - index as u32 + let host_visible = property_flags.contains(vk::MemoryPropertyFlags::HOST_VISIBLE); + let host_coherent = property_flags.contains(vk::MemoryPropertyFlags::HOST_COHERENT); + let allocate_info = vk::MemoryAllocateInfo { + allocation_size: memory_requirements.size, + memory_type_index: i as u32, + ..Default::default() + }; + // Some buffers may fill up, in this case allocating memory can fail. + let mem = device.allocate_memory(&allocate_info, None).ok()?; + + Some((mem, host_visible, host_coherent)) + }) + .next(); + + let (buffer_memory, host_visible, host_coherent) = if let Some(mem) = buffer_memory { + mem } else { + device.destroy_buffer(buffer, None); return Err(GpuError::new("Cannot find suitable memory").into()); }; - let property_flags = - memory_properties.memory_types[memory_type_index as usize].property_flags; - let host_visible = property_flags.contains(vk::MemoryPropertyFlags::HOST_VISIBLE); - let host_coherent = property_flags.contains(vk::MemoryPropertyFlags::HOST_COHERENT); - let allocate_info = vk::MemoryAllocateInfo { - allocation_size: memory_requirements.size, - memory_type_index, - ..Default::default() - }; - let buffer_memory = device.allocate_memory(&allocate_info, None); - let buffer_memory = match buffer_memory { - Ok(mem) => mem, - Err(err) => { - device.destroy_buffer(buffer, None); - return Err(err.into()); - } - }; let result = Buffer { buffer, buffer_memory, @@ -1207,10 +1189,10 @@ impl Device { }; let descriptor_pool_size = [vk::DescriptorPoolSize::builder() .ty(vk::DescriptorType::STORAGE_BUFFER) - .descriptor_count(2) + .descriptor_count(6) .build()]; let descriptor_pool_info = vk::DescriptorPoolCreateInfo::builder() - .max_sets(1) + .max_sets(2) .pool_sizes(&descriptor_pool_size); let descriptor_pool = device.create_descriptor_pool(&descriptor_pool_info, None)?; let cleanup_err = |err| {