From 5700e0e1d3e35b1792fb81ed80bb11ee65d7b2d2 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Tue, 21 Jan 2025 10:35:56 +0000 Subject: [PATCH 01/13] [GPU] Fix weightless caching with int4 models --- .../src/transformations/convert_precision.cpp | 8 +- .../rt_info/weightless_caching_attributes.hpp | 3 + .../op/util/weightless_caching_attributes.cpp | 11 + src/core/src/pass/constant_folding.cpp | 2 + .../include/intel_gpu/primitives/data.hpp | 253 ++++++++++++------ .../graph_optimizer/propagate_constants.cpp | 20 +- .../convert_fc_to_compressed.cpp | 36 ++- 7 files changed, 213 insertions(+), 120 deletions(-) diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index fb0fe36a80cd2e..c829069738d9d0 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -1417,13 +1417,7 @@ bool fuse_type_to_constant(const std::shared_ptr& node, new_const->validate_and_infer_types(); new_const->set_friendly_name(constant->get_friendly_name()); ov::copy_runtime_info(constant, new_const); - - const auto& rt_info = node->get_rt_info(); - auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); - if (weightless_caching_attr != rt_info.end()) { - new_const->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = - weightless_caching_attr->second; - } + ov::copy_weightless_cache_attr(constant, new_const); return true; } return false; diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp index 49f67f91e5cbde..b7c6d538f6cc08 100644 --- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp +++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp @@ -7,9 +7,12 @@ #include "openvino/core/core_visibility.hpp" #include "openvino/core/node.hpp" #include "openvino/core/runtime_attribute.hpp" +#include "transformations_visibility.hpp" namespace ov { +TRANSFORMATIONS_API void copy_weightless_cache_attr(const std::shared_ptr& from, const std::shared_ptr& to); + /** * @brief Holds weightless caching attributes of a single constant. * diff --git a/src/core/src/op/util/weightless_caching_attributes.cpp b/src/core/src/op/util/weightless_caching_attributes.cpp index 7c540f8a3bef02..c08096a31109b9 100644 --- a/src/core/src/op/util/weightless_caching_attributes.cpp +++ b/src/core/src/op/util/weightless_caching_attributes.cpp @@ -7,3 +7,14 @@ bool ov::WeightlessCacheAttribute::is_copyable() const { return false; } + +TRANSFORMATIONS_API void ov::copy_weightless_cache_attr(const std::shared_ptr& from, + const std::shared_ptr& to) { + const auto& rt_info = from->get_rt_info(); + auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); + + if (weightless_caching_attr != rt_info.end()) { + to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = + weightless_caching_attr->second; + } +} \ No newline at end of file diff --git a/src/core/src/pass/constant_folding.cpp b/src/core/src/pass/constant_folding.cpp index e55abd0fb251ea..ae8d74e737ccf0 100644 --- a/src/core/src/pass/constant_folding.cpp +++ b/src/core/src/pass/constant_folding.cpp @@ -7,6 +7,7 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/core/constant_fold_utils.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/rt_info/weightless_caching_attributes.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/util/op_types.hpp" @@ -153,6 +154,7 @@ bool ov::pass::ConstantFolding::run_on_model(const std::shared_ptr& m copy_runtime_info_from_input_values(original_node); // Propagate runtime info attributes to replacement copy_runtime_info(original_node, replacement_ptr); + ov::copy_weightless_cache_attr(original_node, replacement_ptr); rewritten = true; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index a09401af24d043..63de4f3e02cbe9 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -3,9 +3,12 @@ // #pragma once -#include #include +#include +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/primitives/input_layout.hpp" +#include "intel_gpu/primitives/reorder.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/memory.hpp" #include "openvino/op/constant.hpp" @@ -16,11 +19,70 @@ #include "primitive.hpp" #include "transformations/convert_precision.hpp" -namespace cldnn { +namespace { + +struct data_mem_wrapper { + cldnn::memory::ptr mem_ptr = nullptr; + cldnn::allocation_type mem_ptr_alloc_type = cldnn::allocation_type::unknown; + cldnn::layout output_layout{}; + size_t data_size = 0; +}; + +class MemoryManager { +public: + MemoryManager(data_mem_wrapper memory_info, + std::shared_ptr mapped_weights, + size_t bin_offset, + size_t original_size) + : memory_info(memory_info) { + shared_buf = + std::make_shared>>(mapped_weights->data() + bin_offset, + original_size, + mapped_weights); + } + + void copy_to_mem(cldnn::engine& engine) { + OPENVINO_ASSERT(memory_info.mem_ptr_alloc_type != cldnn::allocation_type::unknown); + OPENVINO_ASSERT(!copied); -struct weights_mem { + if (memory_info.mem_ptr_alloc_type == cldnn::allocation_type::usm_host || + memory_info.mem_ptr_alloc_type == cldnn::allocation_type::usm_shared) { + std::memcpy(reinterpret_cast(memory_info.mem_ptr->buffer_ptr()), + get_loaded_data(), + memory_info.data_size); + } else { + auto& strm = engine.get_service_stream(); + auto data_ptr = get_loaded_data(); + memory_info.mem_ptr->copy_from(strm, data_ptr); + } + copied = true; + } + + void set_mem(cldnn::memory::ptr mem_ptr) { + memory_info.mem_ptr = mem_ptr; + } + + bool is_copied() { + return copied; + } + + std::shared_ptr>> get_shared_buf() { + return shared_buf; + } + + cldnn::memory::ptr get_mem_ptr() { + return memory_info.mem_ptr; + } + + void set_transformed_constant(std::shared_ptr constant) { + transformed_constant = constant; + } + +private: std::shared_ptr>> shared_buf = nullptr; std::shared_ptr transformed_constant = nullptr; + data_mem_wrapper memory_info{}; + bool copied = false; const uint8_t* get_loaded_data() { if (transformed_constant) { @@ -31,6 +93,16 @@ struct weights_mem { } }; +} // namespace + +namespace cldnn { + +struct reorder_replication { + bool do_reorder = false; + cldnn::layout input_layout = {}; + cldnn::layout output_layout = {}; +}; + struct weightless_cache_manager { void set_constant_info(size_t bin_offset, size_t original_size, @@ -53,6 +125,12 @@ struct weightless_cache_manager { do_weightless_caching = false; } + void apply_reorder(layout input_layout, layout output_layout) { + reorder_rep.do_reorder = true; + reorder_rep.input_layout = input_layout; + reorder_rep.output_layout = output_layout; + } + void set_new_dtype(ov::element::Type curr_dtype) { this->curr_dtype = curr_dtype; do_precision_conversion = original_dtype != curr_dtype; @@ -76,15 +154,20 @@ struct weightless_cache_manager { ob << make_data(&num_dims, sizeof(size_t)); ob << make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); } + if (reorder_rep.do_reorder) { + ob << true; + ob << reorder_rep.input_layout; + ob << reorder_rep.output_layout; + } else { + ob << false; + } return true; } - std::shared_ptr load(BinaryInputBuffer& ib, - std::shared_ptr mapped_weights, - size_t data_size) { + bool load(BinaryInputBuffer& ib, data_mem_wrapper& mem_info, std::shared_ptr mapped_weights) { ib >> do_weightless_caching; if (!do_weightless_caching) { - return nullptr; + return false; } OPENVINO_ASSERT(mapped_weights != nullptr, "mmap object is null"); @@ -101,24 +184,29 @@ struct weightless_cache_manager { shape.resize(num_dims); ib >> make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); } else { - original_size = data_size; + original_size = mem_info.data_size; + } + + ib >> reorder_rep.do_reorder; + if (reorder_rep.do_reorder) { + ib >> reorder_rep.input_layout; + ib >> reorder_rep.output_layout; } - auto mem_obj = std::make_shared(); - mem_obj->shared_buf = std::make_shared>>( - mapped_weights->data() + bin_offset, - original_size, - mapped_weights); + auto mem_obj = std::make_shared(mem_info, mapped_weights, bin_offset, original_size); if (should_run_transformations()) { - run_transformations(mem_obj); + run_transformations(ib.get_engine(), mem_obj); + } else { + mem_obj->copy_to_mem(ib.get_engine()); } - return mem_obj; + return true; } private: bool do_weightless_caching = false; bool do_precision_conversion = false; + reorder_replication reorder_rep{}; size_t bin_offset = SIZE_MAX; size_t original_size = SIZE_MAX; @@ -127,14 +215,14 @@ struct weightless_cache_manager { ov::Shape shape; bool should_run_transformations() { - return do_precision_conversion; + return do_precision_conversion || reorder_rep.do_reorder; } - void run_transformations(std::shared_ptr mem_obj) { + void run_transformations(engine& engine, std::shared_ptr mem_obj) { auto orig_constant = std::make_shared(original_dtype, shape, - mem_obj->shared_buf->get_ptr(), - mem_obj->shared_buf); + mem_obj->get_shared_buf()->get_ptr(), + mem_obj->get_shared_buf()); ov::ParameterVector inputParams; ov::ResultVector results; @@ -144,8 +232,7 @@ struct weightless_cache_manager { ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); if (do_precision_conversion) { - precisions_map fp_convert_precision_map = { - {original_dtype, curr_dtype}}; + precisions_map fp_convert_precision_map = {{original_dtype, curr_dtype}}; type_to_fuse_map empty_fuse_map = {}; const bool keep_precision_sensitive_in_fp32 = false; const bool convert_input_output_precision = false; @@ -163,8 +250,26 @@ struct weightless_cache_manager { return ov::op::util::is_constant(node); }); OPENVINO_ASSERT(it != ops.end()); - mem_obj->transformed_constant = ov::as_type_ptr(*it); - OPENVINO_ASSERT(mem_obj->transformed_constant->get_element_type() == curr_dtype); + auto transformed_constant = ov::as_type_ptr(*it); + OPENVINO_ASSERT(transformed_constant->get_element_type() == curr_dtype); + mem_obj->set_transformed_constant(transformed_constant); + mem_obj->copy_to_mem(engine); + + if (reorder_rep.do_reorder) { + OPENVINO_ASSERT(reorder_rep.input_layout == mem_obj->get_mem_ptr()->get_layout()); + topology topology(input_layout("input", reorder_rep.input_layout), + reorder("reorder", input_info("input"), reorder_rep.output_layout)); + ExecutionConfig config{}; + ov::intel_gpu::ImplementationDesc reorder_ref = {reorder_rep.output_layout.format, "reorder_data"}; + cldnn::network network(engine, topology, config); + memory::ptr input_mem = mem_obj->get_mem_ptr(); + network.set_input_data("input", input_mem); + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + memory::ptr output_mem = outputs.begin()->second.get_memory(); + OPENVINO_ASSERT(input_mem->size() == output_mem->size()); + mem_obj->set_mem(output_mem); + } } }; @@ -249,73 +354,57 @@ struct data : public primitive_base { mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false); - auto mem_obj = cache_info->load(ib, mapped_weights, data_size); - bool is_weightless_caching_enabled = mem_obj != nullptr; + data_mem_wrapper mem_info{mem, _allocation_type, output_layout, data_size}; + bool is_weightless_caching = cache_info->load(ib, mem_info, mapped_weights); - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(mem->buffer_ptr()), mem_obj->get_loaded_data(), data_size); - } else { - ib >> make_data(mem->buffer_ptr(), data_size); - } + if (is_weightless_caching) { + mem = mem_info.mem_ptr; } else { - const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024; - auto& strm = ib.get_engine().get_service_stream(); - if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) { - std::vector _buf(data_size); - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(_buf.data()), mem_obj->get_loaded_data(), data_size); - } else { - ib >> make_data(_buf.data(), data_size); - } - mem->copy_from(strm, _buf.data()); + if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { + ib >> make_data(mem->buffer_ptr(), data_size); } else { - std::vector _buf1(DATA_BLOCK_SIZE); - std::vector _buf2(DATA_BLOCK_SIZE); - bool buf_flag = true; - event::ptr ev1, ev2; - ev1 = ev2 = nullptr; - size_t dst_offset = 0; - while (dst_offset < data_size) { - const bool is_blocking = false; - const size_t src_offset = 0; - size_t copy_size = - (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset); - if (buf_flag) { - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(_buf1.data()), - mem_obj->get_loaded_data() + dst_offset, - copy_size); - } else { + const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024; + auto& strm = ib.get_engine().get_service_stream(); + if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) { + std::vector _buf(data_size); + ib >> make_data(_buf.data(), data_size); + mem->copy_from(strm, _buf.data()); + } else { + std::vector _buf1(DATA_BLOCK_SIZE); + std::vector _buf2(DATA_BLOCK_SIZE); + bool buf_flag = true; + event::ptr ev1, ev2; + ev1 = ev2 = nullptr; + size_t dst_offset = 0; + while (dst_offset < data_size) { + const bool is_blocking = false; + const size_t src_offset = 0; + size_t copy_size = + (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset); + if (buf_flag) { ib >> make_data(_buf1.data(), copy_size); - } - if (ev2 != nullptr) { - ev2->wait(); - ev2 = nullptr; - } - ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking); - } else { - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(_buf2.data()), - mem_obj->get_loaded_data() + dst_offset, - copy_size); + if (ev2 != nullptr) { + ev2->wait(); + ev2 = nullptr; + } + ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking); } else { ib >> make_data(_buf2.data(), copy_size); + if (ev1 != nullptr) { + ev1->wait(); + ev1 = nullptr; + } + ev2 = mem->copy_from(strm, _buf2.data(), src_offset, dst_offset, copy_size, is_blocking); } - if (ev1 != nullptr) { - ev1->wait(); - ev1 = nullptr; - } - ev2 = mem->copy_from(strm, _buf2.data(), src_offset, dst_offset, copy_size, is_blocking); + dst_offset += DATA_BLOCK_SIZE; + buf_flag = !buf_flag; + } + if (ev2 != nullptr) { + ev2->wait(); + } + if (ev1 != nullptr) { + ev1->wait(); } - dst_offset += DATA_BLOCK_SIZE; - buf_flag = !buf_flag; - } - if (ev2 != nullptr) { - ev2->wait(); - } - if (ev1 != nullptr) { - ev1->wait(); } } } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index 34fa9647ec99c3..03e9d715b62e8c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -95,23 +95,9 @@ void propagate_constants::run(program& p) { } } - auto is_reorder_with_only_dtype_change = [&](program_node& dst) { - if (!in_layout) { - return false; - } - auto& dst_layout = dst.get_output_layout(); - if (in_layout->data_type == dst_layout.data_type) { - return false; - } - - auto aux_layout = dst_layout; - aux_layout.data_type = in_layout->data_type; - return aux_layout == *in_layout.get(); - }; - if (is_reorder_with_only_dtype_change(new_node)) { - new_node.as().get_primitive()->cache_info->set_new_dtype(new_node.get_output_layout().data_type); - } else { - new_node.as().get_primitive()->cache_info->invalidate(); + if (*in_layout.get() != new_node.get_output_layout()) { + new_node.as().get_primitive()->cache_info->apply_reorder(*in_layout.get(), + new_node.get_output_layout()); } curr_node.dependencies.clear(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp index f5062b4c2028cc..eb2277cd185998 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp @@ -3,22 +3,23 @@ // #include "convert_fc_to_compressed.hpp" + #include #include "intel_gpu/op/fully_connected.hpp" #include "intel_gpu/op/fully_connected_compressed.hpp" - +#include "openvino/core/rt_info.hpp" +#include "openvino/core/rt_info/weightless_caching_attributes.hpp" #include "openvino/op/constant.hpp" -#include "openvino/op/subtract.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/multiply.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/transpose.hpp" #include "openvino/op/reshape.hpp" -#include "openvino/core/rt_info.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/pattern.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/pass/pattern/op/or.hpp" #include "transformations/utils/utils.hpp" namespace ov { @@ -103,20 +104,27 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon auto new_shape = (has_transpose || !grouped) ? ov::Shape{current_shape[0] * current_shape[1], current_shape[2]} : ov::Shape{current_shape[0], current_shape[1] * current_shape[2]}; - return std::make_shared(*constant, new_shape); + auto new_constant = std::make_shared(*constant, new_shape); + + ov::copy_weightless_cache_attr(constant, new_constant); + return new_constant; }; auto convert_const_to_u8 = [&](std::shared_ptr node) { auto constant = ov::as_type_ptr(node); + std::shared_ptr result = nullptr; // Convert ZP to u8 if (constant->get_element_type() == ov::element::u8) - return std::dynamic_pointer_cast(constant); - if (constant->get_element_type() == ov::element::u4) - return std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); - if (weight_u8 && sub_with_convert) - return std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); - - return std::dynamic_pointer_cast(constant); + result = ov::as_type_ptr(constant); + else if (constant->get_element_type() == ov::element::u4) + result = ov::as_type_ptr(std::make_shared(node, ov::element::u8)); + else if (weight_u8 && sub_with_convert) + result = ov::as_type_ptr(std::make_shared(node, ov::element::u8)); + else + result = ov::as_type_ptr(constant); + + ov::copy_weightless_cache_attr(node, result); + return result; }; From 4ee643713c5a7da663f751ac42546d0f80b43cc6 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 23 Jan 2025 13:00:12 +0000 Subject: [PATCH 02/13] Bring back std::dynamic_pointer_cast for ov::Node --- .../plugin/transformations/convert_fc_to_compressed.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp index eb2277cd185998..0e579843006d7c 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp @@ -115,13 +115,13 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon std::shared_ptr result = nullptr; // Convert ZP to u8 if (constant->get_element_type() == ov::element::u8) - result = ov::as_type_ptr(constant); + result = std::dynamic_pointer_cast(constant); else if (constant->get_element_type() == ov::element::u4) - result = ov::as_type_ptr(std::make_shared(node, ov::element::u8)); + result = std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); else if (weight_u8 && sub_with_convert) - result = ov::as_type_ptr(std::make_shared(node, ov::element::u8)); + result = std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); else - result = ov::as_type_ptr(constant); + result = std::dynamic_pointer_cast(constant); ov::copy_weightless_cache_attr(node, result); return result; From 81aca3e1f223595cdd9bde156c82330cf9e0e345 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 23 Jan 2025 13:03:11 +0000 Subject: [PATCH 03/13] clang-format --- src/core/src/op/util/weightless_caching_attributes.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/src/op/util/weightless_caching_attributes.cpp b/src/core/src/op/util/weightless_caching_attributes.cpp index c08096a31109b9..10a49b31c9886f 100644 --- a/src/core/src/op/util/weightless_caching_attributes.cpp +++ b/src/core/src/op/util/weightless_caching_attributes.cpp @@ -14,7 +14,6 @@ TRANSFORMATIONS_API void ov::copy_weightless_cache_attr(const std::shared_ptrget_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = - weightless_caching_attr->second; + to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = weightless_caching_attr->second; } } \ No newline at end of file From 1a185cb360326672d6df894bec22858058706bdd Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 23 Jan 2025 14:25:44 +0000 Subject: [PATCH 04/13] Add missing nullptr check --- .../intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index 03e9d715b62e8c..d9f13375688c43 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -95,7 +95,7 @@ void propagate_constants::run(program& p) { } } - if (*in_layout.get() != new_node.get_output_layout()) { + if (in_layout && *in_layout.get() != new_node.get_output_layout()) { new_node.as().get_primitive()->cache_info->apply_reorder(*in_layout.get(), new_node.get_output_layout()); } From f741706403244eee974c2d7fae2788031b092ffc Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Tue, 28 Jan 2025 13:25:04 +0000 Subject: [PATCH 05/13] Remove unnecessary structures and add precision conversion WA --- .../rt_info/weightless_caching_attributes.hpp | 9 +- .../op/util/weightless_caching_attributes.cpp | 10 +- .../include/intel_gpu/primitives/data.hpp | 241 ++++++++---------- .../intel_gpu/src/plugin/program_builder.cpp | 3 +- 4 files changed, 128 insertions(+), 135 deletions(-) diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp index b7c6d538f6cc08..6c71e644886c9d 100644 --- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp +++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp @@ -7,11 +7,12 @@ #include "openvino/core/core_visibility.hpp" #include "openvino/core/node.hpp" #include "openvino/core/runtime_attribute.hpp" -#include "transformations_visibility.hpp" namespace ov { -TRANSFORMATIONS_API void copy_weightless_cache_attr(const std::shared_ptr& from, const std::shared_ptr& to); +OPENVINO_API void copy_weightless_cache_attr(const std::shared_ptr& from, + const std::shared_ptr& to, + bool set_by_precision_conversion = false); /** * @brief Holds weightless caching attributes of a single constant. @@ -32,13 +33,15 @@ class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute { WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype) : original_size(original_size), bin_offset(bin_offset), - original_dtype(original_dtype) {} + original_dtype(original_dtype), + set_by_convert_precision(false) {} bool is_copyable() const override; size_t original_size; size_t bin_offset; ov::element::Type original_dtype; + bool set_by_convert_precision; }; } // namespace ov diff --git a/src/core/src/op/util/weightless_caching_attributes.cpp b/src/core/src/op/util/weightless_caching_attributes.cpp index 10a49b31c9886f..3f1be0c8327731 100644 --- a/src/core/src/op/util/weightless_caching_attributes.cpp +++ b/src/core/src/op/util/weightless_caching_attributes.cpp @@ -8,12 +8,18 @@ bool ov::WeightlessCacheAttribute::is_copyable() const { return false; } -TRANSFORMATIONS_API void ov::copy_weightless_cache_attr(const std::shared_ptr& from, - const std::shared_ptr& to) { +OPENVINO_API void ov::copy_weightless_cache_attr(const std::shared_ptr& from, + const std::shared_ptr& to, + bool set_by_convert_precision) { const auto& rt_info = from->get_rt_info(); auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); if (weightless_caching_attr != rt_info.end()) { to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = weightless_caching_attr->second; + if (set_by_convert_precision) { + to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] + .as() + .set_by_convert_precision = true; + } } } \ No newline at end of file diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index 63de4f3e02cbe9..db6f088947cdca 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -12,7 +12,9 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/memory.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/util/op_types.hpp" +#include "openvino/pass/constant_folding.hpp" #include "openvino/pass/manager.hpp" #include "openvino/runtime/shared_buffer.hpp" #include "openvino/util/mmap_object.hpp" @@ -21,77 +23,21 @@ namespace { -struct data_mem_wrapper { - cldnn::memory::ptr mem_ptr = nullptr; - cldnn::allocation_type mem_ptr_alloc_type = cldnn::allocation_type::unknown; - cldnn::layout output_layout{}; - size_t data_size = 0; -}; - -class MemoryManager { -public: - MemoryManager(data_mem_wrapper memory_info, - std::shared_ptr mapped_weights, - size_t bin_offset, - size_t original_size) - : memory_info(memory_info) { - shared_buf = - std::make_shared>>(mapped_weights->data() + bin_offset, - original_size, - mapped_weights); +bool is_alloc_host_accessible(const cldnn::allocation_type& alloc_type) { + return alloc_type == cldnn::allocation_type::usm_host || alloc_type == cldnn::allocation_type::usm_shared; +} + +void copy_to_dst_mem(cldnn::memory::ptr mem_ptr, const uint8_t* data_ptr) { + if (is_alloc_host_accessible(mem_ptr->get_allocation_type())) { + size_t data_size = mem_ptr->size(); + std::memcpy(reinterpret_cast(mem_ptr->buffer_ptr()), + data_ptr, + data_size); + } else { + auto& strm = mem_ptr->get_engine()->get_service_stream(); + mem_ptr->copy_from(strm, data_ptr); } - - void copy_to_mem(cldnn::engine& engine) { - OPENVINO_ASSERT(memory_info.mem_ptr_alloc_type != cldnn::allocation_type::unknown); - OPENVINO_ASSERT(!copied); - - if (memory_info.mem_ptr_alloc_type == cldnn::allocation_type::usm_host || - memory_info.mem_ptr_alloc_type == cldnn::allocation_type::usm_shared) { - std::memcpy(reinterpret_cast(memory_info.mem_ptr->buffer_ptr()), - get_loaded_data(), - memory_info.data_size); - } else { - auto& strm = engine.get_service_stream(); - auto data_ptr = get_loaded_data(); - memory_info.mem_ptr->copy_from(strm, data_ptr); - } - copied = true; - } - - void set_mem(cldnn::memory::ptr mem_ptr) { - memory_info.mem_ptr = mem_ptr; - } - - bool is_copied() { - return copied; - } - - std::shared_ptr>> get_shared_buf() { - return shared_buf; - } - - cldnn::memory::ptr get_mem_ptr() { - return memory_info.mem_ptr; - } - - void set_transformed_constant(std::shared_ptr constant) { - transformed_constant = constant; - } - -private: - std::shared_ptr>> shared_buf = nullptr; - std::shared_ptr transformed_constant = nullptr; - data_mem_wrapper memory_info{}; - bool copied = false; - - const uint8_t* get_loaded_data() { - if (transformed_constant) { - return reinterpret_cast(transformed_constant->get_data_ptr()); - } - OPENVINO_ASSERT(shared_buf); - return shared_buf->get_ptr(); - } -}; +} } // namespace @@ -108,12 +54,13 @@ struct weightless_cache_manager { size_t original_size, ov::element::Type original_dtype, ov::element::Type curr_dtype, - ov::Shape shape) { + ov::Shape shape, bool precision_conversion_set_by_transformation) { this->bin_offset = bin_offset; this->original_size = original_size; this->original_dtype = original_dtype; this->curr_dtype = curr_dtype; this->shape = shape; + this->precision_conversion_set_by_transformation = precision_conversion_set_by_transformation; do_weightless_caching = true; if (original_dtype != curr_dtype) { @@ -121,21 +68,12 @@ struct weightless_cache_manager { } } - void invalidate() { - do_weightless_caching = false; - } - void apply_reorder(layout input_layout, layout output_layout) { reorder_rep.do_reorder = true; reorder_rep.input_layout = input_layout; reorder_rep.output_layout = output_layout; } - void set_new_dtype(ov::element::Type curr_dtype) { - this->curr_dtype = curr_dtype; - do_precision_conversion = original_dtype != curr_dtype; - } - bool save(BinaryOutputBuffer& ob, size_t data_size) const { if (!do_weightless_caching) { ob << false; @@ -164,7 +102,7 @@ struct weightless_cache_manager { return true; } - bool load(BinaryInputBuffer& ib, data_mem_wrapper& mem_info, std::shared_ptr mapped_weights) { + bool load(BinaryInputBuffer& ib, memory::ptr dst_mem, std::shared_ptr mapped_weights) { ib >> do_weightless_caching; if (!do_weightless_caching) { return false; @@ -184,7 +122,7 @@ struct weightless_cache_manager { shape.resize(num_dims); ib >> make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); } else { - original_size = mem_info.data_size; + original_size = dst_mem->size(); } ib >> reorder_rep.do_reorder; @@ -193,16 +131,20 @@ struct weightless_cache_manager { ib >> reorder_rep.output_layout; } - auto mem_obj = std::make_shared(mem_info, mapped_weights, bin_offset, original_size); + auto shared_buf = + std::make_shared>>(mapped_weights->data() + bin_offset, + original_size, + mapped_weights); if (should_run_transformations()) { - run_transformations(ib.get_engine(), mem_obj); + run_transformations(ib.get_engine(), dst_mem, shared_buf); } else { - mem_obj->copy_to_mem(ib.get_engine()); + copy_to_dst_mem(dst_mem, shared_buf->get_ptr()); } return true; } + private: bool do_weightless_caching = false; bool do_precision_conversion = false; @@ -212,63 +154,107 @@ struct weightless_cache_manager { size_t original_size = SIZE_MAX; ov::element::Type original_dtype = ov::element::Type_t::undefined; ov::element::Type curr_dtype = ov::element::Type_t::undefined; - ov::Shape shape; + ov::Shape shape{}; + bool precision_conversion_set_by_transformation = false; bool should_run_transformations() { return do_precision_conversion || reorder_rep.do_reorder; } - void run_transformations(engine& engine, std::shared_ptr mem_obj) { - auto orig_constant = std::make_shared(original_dtype, - shape, - mem_obj->get_shared_buf()->get_ptr(), - mem_obj->get_shared_buf()); + void run_transformations(engine& engine, + memory::ptr dst_mem, + std::shared_ptr>> shared_buf) { + std::shared_ptr transformed_constant = nullptr; - ov::ParameterVector inputParams; - ov::ResultVector results; - results.push_back(std::make_shared(orig_constant->output(0))); - auto model = std::make_shared(results, inputParams, "aux"); + // Note: this works only until the data is copied to dst_mem. + auto get_intermediate_data = [&]() -> const uint8_t* { + if (transformed_constant) { + return reinterpret_cast(transformed_constant->get_data_ptr()); + } + return shared_buf->get_ptr(); + }; - ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); + // Note: this works only until the data is copied to dst_mem. + auto get_current_data_size = [&]() -> size_t { + if (transformed_constant) { + return transformed_constant->get_byte_size(); + } + return original_size; + }; if (do_precision_conversion) { - precisions_map fp_convert_precision_map = {{original_dtype, curr_dtype}}; - type_to_fuse_map empty_fuse_map = {}; - const bool keep_precision_sensitive_in_fp32 = false; - const bool convert_input_output_precision = false; - const bool store_original_precision_as_rt_attribute = true; - manager.register_pass(fp_convert_precision_map, - empty_fuse_map, - keep_precision_sensitive_in_fp32, - convert_input_output_precision, - store_original_precision_as_rt_attribute); - } + auto orig_constant = std::make_shared(original_dtype, + shape, + get_intermediate_data(), + shared_buf); + + ov::ParameterVector inputParams; + ov::ResultVector results; + ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); + std::shared_ptr model = nullptr; + + if (precision_conversion_set_by_transformation) { + results.push_back(std::make_shared(orig_constant->output(0))); + model = std::make_shared(results, inputParams, "aux"); + + + precisions_map fp_convert_precision_map = {{original_dtype, curr_dtype}}; + type_to_fuse_map empty_fuse_map = {}; + const bool keep_precision_sensitive_in_fp32 = false; + const bool convert_input_output_precision = false; + const bool store_original_precision_as_rt_attribute = true; + manager.register_pass(fp_convert_precision_map, + empty_fuse_map, + keep_precision_sensitive_in_fp32, + convert_input_output_precision, + store_original_precision_as_rt_attribute); + } else { + auto convert_op = std::make_shared(orig_constant, curr_dtype); + results.push_back(std::make_shared(convert_op->output(0))); + model = std::make_shared(results, inputParams, "aux"); + + manager.register_pass(); + } - manager.run_passes(model); - const auto& ops = model->get_ops(); - auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { - return ov::op::util::is_constant(node); - }); - OPENVINO_ASSERT(it != ops.end()); - auto transformed_constant = ov::as_type_ptr(*it); - OPENVINO_ASSERT(transformed_constant->get_element_type() == curr_dtype); - mem_obj->set_transformed_constant(transformed_constant); - mem_obj->copy_to_mem(engine); + manager.run_passes(model); + const auto& ops = model->get_ops(); + auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { + return ov::op::util::is_constant(node); + }); + OPENVINO_ASSERT(it != ops.end()); + transformed_constant = ov::as_type_ptr(*it); + OPENVINO_ASSERT(transformed_constant->get_element_type() == curr_dtype); + } if (reorder_rep.do_reorder) { - OPENVINO_ASSERT(reorder_rep.input_layout == mem_obj->get_mem_ptr()->get_layout()); + const auto allocation_type = dst_mem->get_allocation_type(); + memory::ptr input_mem = engine.allocate_memory(reorder_rep.input_layout, allocation_type, false); + + if (is_alloc_host_accessible(allocation_type)) { + std::memcpy(reinterpret_cast(input_mem->buffer_ptr()), + get_intermediate_data(), + get_current_data_size()); + } else { + auto& strm = engine.get_service_stream(); + input_mem->copy_from(strm, get_intermediate_data()); + } + topology topology(input_layout("input", reorder_rep.input_layout), reorder("reorder", input_info("input"), reorder_rep.output_layout)); ExecutionConfig config{}; + if (engine.get_device_info().supports_immad) { + config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); + } + ov::intel_gpu::ImplementationDesc reorder_ref = {reorder_rep.output_layout.format, "reorder_data"}; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"reorder", reorder_ref} })); cldnn::network network(engine, topology, config); - memory::ptr input_mem = mem_obj->get_mem_ptr(); network.set_input_data("input", input_mem); + network.set_output_memory("reorder", dst_mem); auto outputs = network.execute(); OPENVINO_ASSERT(outputs.size() == 1); - memory::ptr output_mem = outputs.begin()->second.get_memory(); - OPENVINO_ASSERT(input_mem->size() == output_mem->size()); - mem_obj->set_mem(output_mem); + } else { + copy_to_dst_mem(dst_mem, get_intermediate_data()); } } }; @@ -326,7 +312,7 @@ struct data : public primitive_base { bool do_weightless_caching = cache_info->save(ob, data_size); if (!do_weightless_caching) { - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { + if (is_alloc_host_accessible(_allocation_type)) { ob << make_data(mem->buffer_ptr(), data_size); } else { std::vector _buf; @@ -354,13 +340,10 @@ struct data : public primitive_base { mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false); - data_mem_wrapper mem_info{mem, _allocation_type, output_layout, data_size}; - bool is_weightless_caching = cache_info->load(ib, mem_info, mapped_weights); + bool is_weightless_caching = cache_info->load(ib, mem, mapped_weights); - if (is_weightless_caching) { - mem = mem_info.mem_ptr; - } else { - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { + if (!is_weightless_caching) { + if (is_alloc_host_accessible(_allocation_type)) { ib >> make_data(mem->buffer_ptr(), data_size); } else { const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024; diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index b1d43fe20b11ae..8bded8ccb028d2 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -321,7 +321,8 @@ void ProgramBuilder::add_primitive(const ov::Node& op, std::shared_ptr Date: Tue, 28 Jan 2025 14:09:39 +0000 Subject: [PATCH 06/13] Remove ConvertPrecision WA and use Convert op + ConstantFolding for both paths --- .../rt_info/weightless_caching_attributes.hpp | 4 +-- .../op/util/weightless_caching_attributes.cpp | 8 +---- .../include/intel_gpu/primitives/data.hpp | 30 ++++--------------- .../intel_gpu/src/plugin/program_builder.cpp | 3 +- 4 files changed, 8 insertions(+), 37 deletions(-) diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp index 6c71e644886c9d..a3708983b0aa12 100644 --- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp +++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp @@ -10,9 +10,7 @@ namespace ov { -OPENVINO_API void copy_weightless_cache_attr(const std::shared_ptr& from, - const std::shared_ptr& to, - bool set_by_precision_conversion = false); +OPENVINO_API void copy_weightless_cache_attr(const std::shared_ptr& from, const std::shared_ptr& to); /** * @brief Holds weightless caching attributes of a single constant. diff --git a/src/core/src/op/util/weightless_caching_attributes.cpp b/src/core/src/op/util/weightless_caching_attributes.cpp index 3f1be0c8327731..1b2745ce0ae7ec 100644 --- a/src/core/src/op/util/weightless_caching_attributes.cpp +++ b/src/core/src/op/util/weightless_caching_attributes.cpp @@ -9,17 +9,11 @@ bool ov::WeightlessCacheAttribute::is_copyable() const { } OPENVINO_API void ov::copy_weightless_cache_attr(const std::shared_ptr& from, - const std::shared_ptr& to, - bool set_by_convert_precision) { + const std::shared_ptr& to) { const auto& rt_info = from->get_rt_info(); auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); if (weightless_caching_attr != rt_info.end()) { to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = weightless_caching_attr->second; - if (set_by_convert_precision) { - to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] - .as() - .set_by_convert_precision = true; - } } } \ No newline at end of file diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index db6f088947cdca..166d6ad56b01f1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -54,13 +54,12 @@ struct weightless_cache_manager { size_t original_size, ov::element::Type original_dtype, ov::element::Type curr_dtype, - ov::Shape shape, bool precision_conversion_set_by_transformation) { + ov::Shape shape) { this->bin_offset = bin_offset; this->original_size = original_size; this->original_dtype = original_dtype; this->curr_dtype = curr_dtype; this->shape = shape; - this->precision_conversion_set_by_transformation = precision_conversion_set_by_transformation; do_weightless_caching = true; if (original_dtype != curr_dtype) { @@ -155,7 +154,6 @@ struct weightless_cache_manager { ov::element::Type original_dtype = ov::element::Type_t::undefined; ov::element::Type curr_dtype = ov::element::Type_t::undefined; ov::Shape shape{}; - bool precision_conversion_set_by_transformation = false; bool should_run_transformations() { return do_precision_conversion || reorder_rep.do_reorder; @@ -193,28 +191,10 @@ struct weightless_cache_manager { ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); std::shared_ptr model = nullptr; - if (precision_conversion_set_by_transformation) { - results.push_back(std::make_shared(orig_constant->output(0))); - model = std::make_shared(results, inputParams, "aux"); - - - precisions_map fp_convert_precision_map = {{original_dtype, curr_dtype}}; - type_to_fuse_map empty_fuse_map = {}; - const bool keep_precision_sensitive_in_fp32 = false; - const bool convert_input_output_precision = false; - const bool store_original_precision_as_rt_attribute = true; - manager.register_pass(fp_convert_precision_map, - empty_fuse_map, - keep_precision_sensitive_in_fp32, - convert_input_output_precision, - store_original_precision_as_rt_attribute); - } else { - auto convert_op = std::make_shared(orig_constant, curr_dtype); - results.push_back(std::make_shared(convert_op->output(0))); - model = std::make_shared(results, inputParams, "aux"); - - manager.register_pass(); - } + auto convert_op = std::make_shared(orig_constant, curr_dtype); + results.push_back(std::make_shared(convert_op->output(0))); + model = std::make_shared(results, inputParams, "aux"); + manager.register_pass(); manager.run_passes(model); const auto& ops = model->get_ops(); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 8bded8ccb028d2..b1d43fe20b11ae 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -321,8 +321,7 @@ void ProgramBuilder::add_primitive(const ov::Node& op, std::shared_ptr Date: Tue, 28 Jan 2025 14:12:04 +0000 Subject: [PATCH 07/13] Remove missed field --- .../openvino/core/rt_info/weightless_caching_attributes.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp index a3708983b0aa12..f5c7d3446dbcb6 100644 --- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp +++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp @@ -31,15 +31,13 @@ class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute { WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype) : original_size(original_size), bin_offset(bin_offset), - original_dtype(original_dtype), - set_by_convert_precision(false) {} + original_dtype(original_dtype) {} bool is_copyable() const override; size_t original_size; size_t bin_offset; ov::element::Type original_dtype; - bool set_by_convert_precision; }; } // namespace ov From 29b0bd0768cff0e271742b4152f1c917a2419561 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Wed, 29 Jan 2025 08:09:08 +0000 Subject: [PATCH 08/13] Don't force reorder impl --- .../intel_gpu/include/intel_gpu/primitives/data.hpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index 166d6ad56b01f1..a09f5277bcfcb2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -221,17 +221,11 @@ struct weightless_cache_manager { topology topology(input_layout("input", reorder_rep.input_layout), reorder("reorder", input_info("input"), reorder_rep.output_layout)); - ExecutionConfig config{}; - if (engine.get_device_info().supports_immad) { - config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); - } - - ov::intel_gpu::ImplementationDesc reorder_ref = {reorder_rep.output_layout.format, "reorder_data"}; - config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"reorder", reorder_ref} })); - cldnn::network network(engine, topology, config); + cldnn::network network(engine, topology, {}); network.set_input_data("input", input_mem); network.set_output_memory("reorder", dst_mem); auto outputs = network.execute(); + network.reset_execution(true); OPENVINO_ASSERT(outputs.size() == 1); } else { copy_to_dst_mem(dst_mem, get_intermediate_data()); From 57361a75c2e5f3bb7b8ec61a41615ad22ffd4e7e Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Fri, 31 Jan 2025 14:55:05 +0000 Subject: [PATCH 09/13] Support all possible reorders --- .../include/intel_gpu/primitives/data.hpp | 47 +++++++++++-------- .../graph_optimizer/propagate_constants.cpp | 32 +++++++------ .../src/graph/include/pass_manager.h | 5 +- 3 files changed, 48 insertions(+), 36 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index a09f5277bcfcb2..98ea343a343c3c 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -44,9 +44,8 @@ void copy_to_dst_mem(cldnn::memory::ptr mem_ptr, const uint8_t* data_ptr) { namespace cldnn { struct reorder_replication { - bool do_reorder = false; - cldnn::layout input_layout = {}; - cldnn::layout output_layout = {}; + std::shared_ptr input_layout = nullptr; + std::shared_ptr reorder = nullptr; }; struct weightless_cache_manager { @@ -67,10 +66,8 @@ struct weightless_cache_manager { } } - void apply_reorder(layout input_layout, layout output_layout) { - reorder_rep.do_reorder = true; - reorder_rep.input_layout = input_layout; - reorder_rep.output_layout = output_layout; + void apply_reorder(std::shared_ptr input_layout, std::shared_ptr reorder) { + reorder_rep = {input_layout, reorder}; } bool save(BinaryOutputBuffer& ob, size_t data_size) const { @@ -91,10 +88,12 @@ struct weightless_cache_manager { ob << make_data(&num_dims, sizeof(size_t)); ob << make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); } - if (reorder_rep.do_reorder) { + + bool do_reorder = should_run_reorder(); + if (do_reorder) { ob << true; - ob << reorder_rep.input_layout; - ob << reorder_rep.output_layout; + ob << *reorder_rep.input_layout; + ob << *reorder_rep.reorder; } else { ob << false; } @@ -124,10 +123,13 @@ struct weightless_cache_manager { original_size = dst_mem->size(); } - ib >> reorder_rep.do_reorder; - if (reorder_rep.do_reorder) { - ib >> reorder_rep.input_layout; - ib >> reorder_rep.output_layout; + bool do_reorder = false; + ib >> do_reorder; + if (do_reorder) { + reorder_rep.input_layout = std::make_shared(); + ib >> *reorder_rep.input_layout; + reorder_rep.reorder = std::make_shared(); + ib >> *reorder_rep.reorder; } auto shared_buf = @@ -155,8 +157,12 @@ struct weightless_cache_manager { ov::element::Type curr_dtype = ov::element::Type_t::undefined; ov::Shape shape{}; + bool should_run_reorder() const { + return reorder_rep.reorder != nullptr; + } + bool should_run_transformations() { - return do_precision_conversion || reorder_rep.do_reorder; + return do_precision_conversion || should_run_reorder(); } void run_transformations(engine& engine, @@ -206,9 +212,9 @@ struct weightless_cache_manager { OPENVINO_ASSERT(transformed_constant->get_element_type() == curr_dtype); } - if (reorder_rep.do_reorder) { + if (should_run_reorder()) { const auto allocation_type = dst_mem->get_allocation_type(); - memory::ptr input_mem = engine.allocate_memory(reorder_rep.input_layout, allocation_type, false); + memory::ptr input_mem = engine.allocate_memory(*reorder_rep.input_layout, allocation_type, false); if (is_alloc_host_accessible(allocation_type)) { std::memcpy(reinterpret_cast(input_mem->buffer_ptr()), @@ -219,11 +225,12 @@ struct weightless_cache_manager { input_mem->copy_from(strm, get_intermediate_data()); } - topology topology(input_layout("input", reorder_rep.input_layout), - reorder("reorder", input_info("input"), reorder_rep.output_layout)); + reorder_rep.reorder->input = {input_info("input")}; + topology topology(input_layout("input", *reorder_rep.input_layout), + *reorder_rep.reorder); cldnn::network network(engine, topology, {}); network.set_input_data("input", input_mem); - network.set_output_memory("reorder", dst_mem); + network.set_output_memory(reorder_rep.reorder->id, dst_mem); auto outputs = network.execute(); network.reset_execution(true); OPENVINO_ASSERT(outputs.size() == 1); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index d9f13375688c43..34314155837197 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -77,11 +77,13 @@ void propagate_constants::run(program& p) { auto& id_to_replace = std::get<0>(cout); auto mem_impl = std::get<1>(cout); auto cache_info = std::get<2>(cout); - auto in_layout = std::get<3>(cout); + auto cache_manager = std::get<0>(cache_info); + auto in_layout = std::get<1>(cache_info); + auto reorder = std::get<2>(cache_info); auto const_data = std::make_shared("_cldnn_const_prop_" + id_to_replace, mem_impl, /* <<< REMOVE ME WHEN POSSIBLE */ - cache_info); + cache_manager); auto& new_node = p.get_or_create(const_data); auto& curr_node = p.get_node(id_to_replace); @@ -95,9 +97,8 @@ void propagate_constants::run(program& p) { } } - if (in_layout && *in_layout.get() != new_node.get_output_layout()) { - new_node.as().get_primitive()->cache_info->apply_reorder(*in_layout.get(), - new_node.get_output_layout()); + if (in_layout && reorder) { + new_node.as().get_primitive()->cache_info->apply_reorder(in_layout, reorder); } curr_node.dependencies.clear(); @@ -121,7 +122,10 @@ bool propagate_constants::has_non_const_user(program_node& node) const { return false; } -std::list, std::shared_ptr>> +using cache_tuple = + std::tuple, std::shared_ptr, std::shared_ptr>; + +std::list> propagate_constants::calculate(engine& engine, const ExecutionConfig& config, std::shared_ptr task_executor) { @@ -132,18 +136,18 @@ propagate_constants::calculate(engine& engine, cf_config.set_property(ov::intel_gpu::optimize_data(false)); cf_config.set_property(ov::intel_gpu::custom_outputs(const_outputs)); network::ptr net = network::build_network(engine, nodes, cf_config, task_executor, true); - std::map, std::shared_ptr>> - weightless_cache_map; + std::map weightless_cache_map; for (auto& cin : const_inputs) { net->set_input_data(cin->id(), cin->get_attached_memory_ptr()); auto users = cin->get_users(); if (users.size() == 1 && users.front()->is_type()) { auto rprim = users.front()->as().get_primitive(); + auto copy = std::shared_ptr(new reorder(*rprim)); auto id = rprim->id; auto cache_ptr = cin->as().get_primitive()->cache_info; auto layout_ptr = std::make_shared(cin->get_output_layout()); - weightless_cache_map.emplace(id, std::make_pair(cache_ptr, layout_ptr)); + weightless_cache_map.emplace(id, std::make_tuple(cache_ptr, layout_ptr, copy)); } } @@ -151,17 +155,15 @@ propagate_constants::calculate(engine& engine, net->reset_execution(true); // wait for computations to complete auto outputs = net->get_outputs(); - std::list, std::shared_ptr>> + std::list> ret; for (auto& out : outputs) { - std::shared_ptr cache_ptr = nullptr; - std::shared_ptr layout_ptr = nullptr; + cache_tuple cache_info{}; auto it = weightless_cache_map.find(out->id()); if (it != weightless_cache_map.end()) { - cache_ptr = it->second.first; - layout_ptr = it->second.second; + cache_info = it->second; } - ret.push_back({out->id(), out->output_memory_ptr(), cache_ptr, layout_ptr}); + ret.push_back({out->id(), out->output_memory_ptr(), cache_info}); } return ret; diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 0bdcf4ef82672e..fb7f05927b249c 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -222,7 +222,10 @@ class propagate_constants : public base_pass { private: void run(program& p) override; - std::list, std::shared_ptr>> + std::list, std::shared_ptr, std::shared_ptr>>> calculate(engine& engine, const ExecutionConfig& config, std::shared_ptr task_executor); From 7176273bafb68d2e274150b7aa0c4a5aed74dac1 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Wed, 5 Feb 2025 12:03:38 +0000 Subject: [PATCH 10/13] Make network sync more efficient --- src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index 98ea343a343c3c..f06d74ea2567ca 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -232,7 +232,10 @@ struct weightless_cache_manager { network.set_input_data("input", input_mem); network.set_output_memory(reorder_rep.reorder->id, dst_mem); auto outputs = network.execute(); - network.reset_execution(true); + for (const auto& output : outputs) { + output.second.get_event()->wait(); + } + OPENVINO_ASSERT(outputs.size() == 1); } else { copy_to_dst_mem(dst_mem, get_intermediate_data()); From a823f0dd5c2a18cd6456fc0dc8ba3462f313c36f Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 6 Feb 2025 10:03:05 +0000 Subject: [PATCH 11/13] Add low precision tests --- .../tests/functional/behavior/model_cache.cpp | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 1f911d4a0f2070..ddae706cf638c6 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -14,6 +14,7 @@ #include "common_test_utils/test_common.hpp" #include "openvino/pass/serialize.hpp" #include "openvino/util/codec_xor.hpp" +#include "shared_test_classes/subgraph/weights_decompression_builders.hpp" namespace { typedef std::tuple testParams; @@ -30,7 +31,7 @@ class CheckWeightlessCacheAccuracy : public ::testing::Test, public ::testing::W std::ostringstream result; const char separator = '_'; result << "use_compile_model_api=" << use_compile_model_api_ << separator; - result << "_do_encryption=" << do_encryption_; + result << "do_encryption=" << do_encryption_ << separator; result << "inference_mode=" << inference_mode_ << separator; result << "model_dtype=" << model_dtype_; return result.str(); @@ -148,6 +149,35 @@ TEST_P(CheckWeightlessCacheAccuracy, TiWithLstmCell) { OV_ASSERT_NO_THROW(run()); } +class CheckWeightlessCacheAccuracyLowPrecision : public CheckWeightlessCacheAccuracy {}; + +TEST_P(CheckWeightlessCacheAccuracyLowPrecision, MatmulWeightsDecompression) { + ov::test::MatMulDecompressionShapeParams shape_params{{{}, {{1, 4, 16}}}, {1, 16, 32}}; + auto dynShape = shape_params.data_shape.first; + if (dynShape.rank() == 0) { + dynShape = shape_params.data_shape.second.front(); + } + ov::ParameterVector params{std::make_shared(ov::element::f32, dynShape)}; + const auto weights_subgraph = ov::test::initMatMulDecompressionSubgraph(shape_params.weights_shape, + shape_params.decompression_group_size, + ov::element::f32, + model_dtype, + ov::element::f32, + ov::element::undefined, + true, + ov::test::DecompressionType::full, + ov::test::DecompressionType::full, + false); + auto matmul = std::make_shared(params[0], weights_subgraph); + + ov::ResultVector results; + for (const auto& output : matmul->outputs()) { + results.push_back(std::make_shared(output)); + } + model = std::make_shared(results, params, "MatmulWeightsDecompression"); + OV_ASSERT_NO_THROW(run()); +} + const std::vector inference_modes = { ov::element::f32, ov::element::f16, @@ -159,6 +189,12 @@ const std::vector model_dtypes = { ov::element::bf16, }; +const std::vector low_precision_dtypes = { + ov::element::u8, + ov::element::u4, + ov::element::i4, +}; + INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, CheckWeightlessCacheAccuracy, ::testing::Combine(::testing::Bool(), @@ -167,4 +203,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, ::testing::ValuesIn(model_dtypes)), CheckWeightlessCacheAccuracy::get_test_case_name); +INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracyLowPrecision, + CheckWeightlessCacheAccuracyLowPrecision, + ::testing::Combine(::testing::Bool(), + ::testing::Bool(), + ::testing::ValuesIn(inference_modes), + ::testing::ValuesIn(low_precision_dtypes)), + CheckWeightlessCacheAccuracy::get_test_case_name); + } // namespace From 76ca48c18801c9f2fa65b3ce7bef34f823adc1b0 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 6 Feb 2025 14:37:36 +0000 Subject: [PATCH 12/13] Add checking if cache is not regenerated during the second run --- .../tests/functional/behavior/model_cache.cpp | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index ddae706cf638c6..78ef5562a62370 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -2,6 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#include + #include #include "base/ov_behavior_test_utils.hpp" @@ -15,6 +18,13 @@ #include "openvino/pass/serialize.hpp" #include "openvino/util/codec_xor.hpp" #include "shared_test_classes/subgraph/weights_decompression_builders.hpp" +#ifndef WIN32 +# include +#endif + +#ifdef WIN32 +# define stat _stat +#endif namespace { typedef std::tuple testParams; @@ -100,14 +110,44 @@ void CheckWeightlessCacheAccuracy::run() { ofstr.close(); } - auto ifstr = std::ifstream(cache_path, std::ifstream::binary); + auto get_cache_path = [&]() { + std::string path; + if (use_compile_model_api) { + auto blobs = ov::test::utils::listFilesWithExt(cache_dir, "blob"); + EXPECT_EQ(blobs.size(), 1); + path = blobs[0]; + } else { + path = cache_path; + } + return path; + }; + + auto get_mod_time = [&](const std::string& path) { + struct stat result; + if (stat(path.c_str(), &result) == 0) { + return result.st_mtime; + } + return static_cast<__time_t>(0); + }; + + auto first_cache_path = get_cache_path(); + auto first_mod_time = get_mod_time(first_cache_path); + ASSERT_NE(first_mod_time, static_cast<__time_t>(0)); + ov::CompiledModel imported_model; if (use_compile_model_api) { imported_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config); } else { + auto ifstr = std::ifstream(cache_path, std::ifstream::binary); imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path); + ifstr.close(); } - ifstr.close(); + + auto second_cache_path = get_cache_path(); + auto second_mod_time = get_mod_time(second_cache_path); + + // Something went wrong if a new cache is created during the second run. + ASSERT_EQ(first_mod_time, second_mod_time); auto orig_req = compiled_model.create_infer_request(); auto new_req = imported_model.create_infer_request(); From c85cf79f99938f396f0d40f1dc561d1e3bb460ae Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Thu, 6 Feb 2025 17:13:10 +0000 Subject: [PATCH 13/13] Change __time_t to time_t --- .../intel_gpu/tests/functional/behavior/model_cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 78ef5562a62370..1257ee02d1e69b 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -127,12 +127,12 @@ void CheckWeightlessCacheAccuracy::run() { if (stat(path.c_str(), &result) == 0) { return result.st_mtime; } - return static_cast<__time_t>(0); + return static_cast(0); }; auto first_cache_path = get_cache_path(); auto first_mod_time = get_mod_time(first_cache_path); - ASSERT_NE(first_mod_time, static_cast<__time_t>(0)); + ASSERT_NE(first_mod_time, static_cast(0)); ov::CompiledModel imported_model; if (use_compile_model_api) {