diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index fb0fe36a80cd2e..c829069738d9d0 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -1417,13 +1417,7 @@ bool fuse_type_to_constant(const std::shared_ptr& node, new_const->validate_and_infer_types(); new_const->set_friendly_name(constant->get_friendly_name()); ov::copy_runtime_info(constant, new_const); - - const auto& rt_info = node->get_rt_info(); - auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); - if (weightless_caching_attr != rt_info.end()) { - new_const->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = - weightless_caching_attr->second; - } + ov::copy_weightless_cache_attr(constant, new_const); return true; } return false; diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp index 49f67f91e5cbde..f5c7d3446dbcb6 100644 --- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp +++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp @@ -10,6 +10,8 @@ namespace ov { +OPENVINO_API void copy_weightless_cache_attr(const std::shared_ptr& from, const std::shared_ptr& to); + /** * @brief Holds weightless caching attributes of a single constant. * diff --git a/src/core/src/op/util/weightless_caching_attributes.cpp b/src/core/src/op/util/weightless_caching_attributes.cpp index 7c540f8a3bef02..1b2745ce0ae7ec 100644 --- a/src/core/src/op/util/weightless_caching_attributes.cpp +++ b/src/core/src/op/util/weightless_caching_attributes.cpp @@ -7,3 +7,13 @@ bool ov::WeightlessCacheAttribute::is_copyable() const { return false; } + +OPENVINO_API void ov::copy_weightless_cache_attr(const std::shared_ptr& from, + const std::shared_ptr& to) { + const auto& rt_info = from->get_rt_info(); + auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); + + if (weightless_caching_attr != rt_info.end()) { + to->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] = weightless_caching_attr->second; + } +} \ No newline at end of file diff --git a/src/core/src/pass/constant_folding.cpp b/src/core/src/pass/constant_folding.cpp index e55abd0fb251ea..ae8d74e737ccf0 100644 --- a/src/core/src/pass/constant_folding.cpp +++ b/src/core/src/pass/constant_folding.cpp @@ -7,6 +7,7 @@ #include "openvino/cc/pass/itt.hpp" #include "openvino/core/constant_fold_utils.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/rt_info/weightless_caching_attributes.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/util/op_types.hpp" @@ -153,6 +154,7 @@ bool ov::pass::ConstantFolding::run_on_model(const std::shared_ptr& m copy_runtime_info_from_input_values(original_node); // Propagate runtime info attributes to replacement copy_runtime_info(original_node, replacement_ptr); + ov::copy_weightless_cache_attr(original_node, replacement_ptr); rewritten = true; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index a09401af24d043..f06d74ea2567ca 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -3,32 +3,49 @@ // #pragma once -#include #include +#include +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/primitives/input_layout.hpp" +#include "intel_gpu/primitives/reorder.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/memory.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/util/op_types.hpp" +#include "openvino/pass/constant_folding.hpp" #include "openvino/pass/manager.hpp" #include "openvino/runtime/shared_buffer.hpp" #include "openvino/util/mmap_object.hpp" #include "primitive.hpp" #include "transformations/convert_precision.hpp" -namespace cldnn { +namespace { + +bool is_alloc_host_accessible(const cldnn::allocation_type& alloc_type) { + return alloc_type == cldnn::allocation_type::usm_host || alloc_type == cldnn::allocation_type::usm_shared; +} + +void copy_to_dst_mem(cldnn::memory::ptr mem_ptr, const uint8_t* data_ptr) { + if (is_alloc_host_accessible(mem_ptr->get_allocation_type())) { + size_t data_size = mem_ptr->size(); + std::memcpy(reinterpret_cast(mem_ptr->buffer_ptr()), + data_ptr, + data_size); + } else { + auto& strm = mem_ptr->get_engine()->get_service_stream(); + mem_ptr->copy_from(strm, data_ptr); + } +} -struct weights_mem { - std::shared_ptr>> shared_buf = nullptr; - std::shared_ptr transformed_constant = nullptr; +} // namespace - const uint8_t* get_loaded_data() { - if (transformed_constant) { - return reinterpret_cast(transformed_constant->get_data_ptr()); - } - OPENVINO_ASSERT(shared_buf); - return shared_buf->get_ptr(); - } +namespace cldnn { + +struct reorder_replication { + std::shared_ptr input_layout = nullptr; + std::shared_ptr reorder = nullptr; }; struct weightless_cache_manager { @@ -49,13 +66,8 @@ struct weightless_cache_manager { } } - void invalidate() { - do_weightless_caching = false; - } - - void set_new_dtype(ov::element::Type curr_dtype) { - this->curr_dtype = curr_dtype; - do_precision_conversion = original_dtype != curr_dtype; + void apply_reorder(std::shared_ptr input_layout, std::shared_ptr reorder) { + reorder_rep = {input_layout, reorder}; } bool save(BinaryOutputBuffer& ob, size_t data_size) const { @@ -76,15 +88,22 @@ struct weightless_cache_manager { ob << make_data(&num_dims, sizeof(size_t)); ob << make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); } + + bool do_reorder = should_run_reorder(); + if (do_reorder) { + ob << true; + ob << *reorder_rep.input_layout; + ob << *reorder_rep.reorder; + } else { + ob << false; + } return true; } - std::shared_ptr load(BinaryInputBuffer& ib, - std::shared_ptr mapped_weights, - size_t data_size) { + bool load(BinaryInputBuffer& ib, memory::ptr dst_mem, std::shared_ptr mapped_weights) { ib >> do_weightless_caching; if (!do_weightless_caching) { - return nullptr; + return false; } OPENVINO_ASSERT(mapped_weights != nullptr, "mmap object is null"); @@ -101,70 +120,126 @@ struct weightless_cache_manager { shape.resize(num_dims); ib >> make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); } else { - original_size = data_size; + original_size = dst_mem->size(); } - auto mem_obj = std::make_shared(); - mem_obj->shared_buf = std::make_shared>>( - mapped_weights->data() + bin_offset, - original_size, - mapped_weights); + bool do_reorder = false; + ib >> do_reorder; + if (do_reorder) { + reorder_rep.input_layout = std::make_shared(); + ib >> *reorder_rep.input_layout; + reorder_rep.reorder = std::make_shared(); + ib >> *reorder_rep.reorder; + } + + auto shared_buf = + std::make_shared>>(mapped_weights->data() + bin_offset, + original_size, + mapped_weights); if (should_run_transformations()) { - run_transformations(mem_obj); + run_transformations(ib.get_engine(), dst_mem, shared_buf); + } else { + copy_to_dst_mem(dst_mem, shared_buf->get_ptr()); } - return mem_obj; + return true; } + private: bool do_weightless_caching = false; bool do_precision_conversion = false; + reorder_replication reorder_rep{}; size_t bin_offset = SIZE_MAX; size_t original_size = SIZE_MAX; ov::element::Type original_dtype = ov::element::Type_t::undefined; ov::element::Type curr_dtype = ov::element::Type_t::undefined; - ov::Shape shape; + ov::Shape shape{}; + + bool should_run_reorder() const { + return reorder_rep.reorder != nullptr; + } bool should_run_transformations() { - return do_precision_conversion; + return do_precision_conversion || should_run_reorder(); } - void run_transformations(std::shared_ptr mem_obj) { - auto orig_constant = std::make_shared(original_dtype, - shape, - mem_obj->shared_buf->get_ptr(), - mem_obj->shared_buf); + void run_transformations(engine& engine, + memory::ptr dst_mem, + std::shared_ptr>> shared_buf) { + std::shared_ptr transformed_constant = nullptr; - ov::ParameterVector inputParams; - ov::ResultVector results; - results.push_back(std::make_shared(orig_constant->output(0))); - auto model = std::make_shared(results, inputParams, "aux"); + // Note: this works only until the data is copied to dst_mem. + auto get_intermediate_data = [&]() -> const uint8_t* { + if (transformed_constant) { + return reinterpret_cast(transformed_constant->get_data_ptr()); + } + return shared_buf->get_ptr(); + }; - ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); + // Note: this works only until the data is copied to dst_mem. + auto get_current_data_size = [&]() -> size_t { + if (transformed_constant) { + return transformed_constant->get_byte_size(); + } + return original_size; + }; if (do_precision_conversion) { - precisions_map fp_convert_precision_map = { - {original_dtype, curr_dtype}}; - type_to_fuse_map empty_fuse_map = {}; - const bool keep_precision_sensitive_in_fp32 = false; - const bool convert_input_output_precision = false; - const bool store_original_precision_as_rt_attribute = true; - manager.register_pass(fp_convert_precision_map, - empty_fuse_map, - keep_precision_sensitive_in_fp32, - convert_input_output_precision, - store_original_precision_as_rt_attribute); + auto orig_constant = std::make_shared(original_dtype, + shape, + get_intermediate_data(), + shared_buf); + + ov::ParameterVector inputParams; + ov::ResultVector results; + ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); + std::shared_ptr model = nullptr; + + auto convert_op = std::make_shared(orig_constant, curr_dtype); + results.push_back(std::make_shared(convert_op->output(0))); + model = std::make_shared(results, inputParams, "aux"); + manager.register_pass(); + + manager.run_passes(model); + const auto& ops = model->get_ops(); + auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { + return ov::op::util::is_constant(node); + }); + OPENVINO_ASSERT(it != ops.end()); + transformed_constant = ov::as_type_ptr(*it); + OPENVINO_ASSERT(transformed_constant->get_element_type() == curr_dtype); } - manager.run_passes(model); - const auto& ops = model->get_ops(); - auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { - return ov::op::util::is_constant(node); - }); - OPENVINO_ASSERT(it != ops.end()); - mem_obj->transformed_constant = ov::as_type_ptr(*it); - OPENVINO_ASSERT(mem_obj->transformed_constant->get_element_type() == curr_dtype); + if (should_run_reorder()) { + const auto allocation_type = dst_mem->get_allocation_type(); + memory::ptr input_mem = engine.allocate_memory(*reorder_rep.input_layout, allocation_type, false); + + if (is_alloc_host_accessible(allocation_type)) { + std::memcpy(reinterpret_cast(input_mem->buffer_ptr()), + get_intermediate_data(), + get_current_data_size()); + } else { + auto& strm = engine.get_service_stream(); + input_mem->copy_from(strm, get_intermediate_data()); + } + + reorder_rep.reorder->input = {input_info("input")}; + topology topology(input_layout("input", *reorder_rep.input_layout), + *reorder_rep.reorder); + cldnn::network network(engine, topology, {}); + network.set_input_data("input", input_mem); + network.set_output_memory(reorder_rep.reorder->id, dst_mem); + auto outputs = network.execute(); + for (const auto& output : outputs) { + output.second.get_event()->wait(); + } + + OPENVINO_ASSERT(outputs.size() == 1); + } else { + copy_to_dst_mem(dst_mem, get_intermediate_data()); + } } }; @@ -221,7 +296,7 @@ struct data : public primitive_base { bool do_weightless_caching = cache_info->save(ob, data_size); if (!do_weightless_caching) { - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { + if (is_alloc_host_accessible(_allocation_type)) { ob << make_data(mem->buffer_ptr(), data_size); } else { std::vector _buf; @@ -249,73 +324,54 @@ struct data : public primitive_base { mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false); - auto mem_obj = cache_info->load(ib, mapped_weights, data_size); - bool is_weightless_caching_enabled = mem_obj != nullptr; + bool is_weightless_caching = cache_info->load(ib, mem, mapped_weights); - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(mem->buffer_ptr()), mem_obj->get_loaded_data(), data_size); - } else { + if (!is_weightless_caching) { + if (is_alloc_host_accessible(_allocation_type)) { ib >> make_data(mem->buffer_ptr(), data_size); - } - } else { - const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024; - auto& strm = ib.get_engine().get_service_stream(); - if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) { - std::vector _buf(data_size); - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(_buf.data()), mem_obj->get_loaded_data(), data_size); - } else { - ib >> make_data(_buf.data(), data_size); - } - mem->copy_from(strm, _buf.data()); } else { - std::vector _buf1(DATA_BLOCK_SIZE); - std::vector _buf2(DATA_BLOCK_SIZE); - bool buf_flag = true; - event::ptr ev1, ev2; - ev1 = ev2 = nullptr; - size_t dst_offset = 0; - while (dst_offset < data_size) { - const bool is_blocking = false; - const size_t src_offset = 0; - size_t copy_size = - (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset); - if (buf_flag) { - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(_buf1.data()), - mem_obj->get_loaded_data() + dst_offset, - copy_size); - } else { + const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024; + auto& strm = ib.get_engine().get_service_stream(); + if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) { + std::vector _buf(data_size); + ib >> make_data(_buf.data(), data_size); + mem->copy_from(strm, _buf.data()); + } else { + std::vector _buf1(DATA_BLOCK_SIZE); + std::vector _buf2(DATA_BLOCK_SIZE); + bool buf_flag = true; + event::ptr ev1, ev2; + ev1 = ev2 = nullptr; + size_t dst_offset = 0; + while (dst_offset < data_size) { + const bool is_blocking = false; + const size_t src_offset = 0; + size_t copy_size = + (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset); + if (buf_flag) { ib >> make_data(_buf1.data(), copy_size); - } - if (ev2 != nullptr) { - ev2->wait(); - ev2 = nullptr; - } - ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking); - } else { - if (is_weightless_caching_enabled) { - std::memcpy(reinterpret_cast(_buf2.data()), - mem_obj->get_loaded_data() + dst_offset, - copy_size); + if (ev2 != nullptr) { + ev2->wait(); + ev2 = nullptr; + } + ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking); } else { ib >> make_data(_buf2.data(), copy_size); + if (ev1 != nullptr) { + ev1->wait(); + ev1 = nullptr; + } + ev2 = mem->copy_from(strm, _buf2.data(), src_offset, dst_offset, copy_size, is_blocking); } - if (ev1 != nullptr) { - ev1->wait(); - ev1 = nullptr; - } - ev2 = mem->copy_from(strm, _buf2.data(), src_offset, dst_offset, copy_size, is_blocking); + dst_offset += DATA_BLOCK_SIZE; + buf_flag = !buf_flag; + } + if (ev2 != nullptr) { + ev2->wait(); + } + if (ev1 != nullptr) { + ev1->wait(); } - dst_offset += DATA_BLOCK_SIZE; - buf_flag = !buf_flag; - } - if (ev2 != nullptr) { - ev2->wait(); - } - if (ev1 != nullptr) { - ev1->wait(); } } } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index 34fa9647ec99c3..34314155837197 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -77,11 +77,13 @@ void propagate_constants::run(program& p) { auto& id_to_replace = std::get<0>(cout); auto mem_impl = std::get<1>(cout); auto cache_info = std::get<2>(cout); - auto in_layout = std::get<3>(cout); + auto cache_manager = std::get<0>(cache_info); + auto in_layout = std::get<1>(cache_info); + auto reorder = std::get<2>(cache_info); auto const_data = std::make_shared("_cldnn_const_prop_" + id_to_replace, mem_impl, /* <<< REMOVE ME WHEN POSSIBLE */ - cache_info); + cache_manager); auto& new_node = p.get_or_create(const_data); auto& curr_node = p.get_node(id_to_replace); @@ -95,23 +97,8 @@ void propagate_constants::run(program& p) { } } - auto is_reorder_with_only_dtype_change = [&](program_node& dst) { - if (!in_layout) { - return false; - } - auto& dst_layout = dst.get_output_layout(); - if (in_layout->data_type == dst_layout.data_type) { - return false; - } - - auto aux_layout = dst_layout; - aux_layout.data_type = in_layout->data_type; - return aux_layout == *in_layout.get(); - }; - if (is_reorder_with_only_dtype_change(new_node)) { - new_node.as().get_primitive()->cache_info->set_new_dtype(new_node.get_output_layout().data_type); - } else { - new_node.as().get_primitive()->cache_info->invalidate(); + if (in_layout && reorder) { + new_node.as().get_primitive()->cache_info->apply_reorder(in_layout, reorder); } curr_node.dependencies.clear(); @@ -135,7 +122,10 @@ bool propagate_constants::has_non_const_user(program_node& node) const { return false; } -std::list, std::shared_ptr>> +using cache_tuple = + std::tuple, std::shared_ptr, std::shared_ptr>; + +std::list> propagate_constants::calculate(engine& engine, const ExecutionConfig& config, std::shared_ptr task_executor) { @@ -146,18 +136,18 @@ propagate_constants::calculate(engine& engine, cf_config.set_property(ov::intel_gpu::optimize_data(false)); cf_config.set_property(ov::intel_gpu::custom_outputs(const_outputs)); network::ptr net = network::build_network(engine, nodes, cf_config, task_executor, true); - std::map, std::shared_ptr>> - weightless_cache_map; + std::map weightless_cache_map; for (auto& cin : const_inputs) { net->set_input_data(cin->id(), cin->get_attached_memory_ptr()); auto users = cin->get_users(); if (users.size() == 1 && users.front()->is_type()) { auto rprim = users.front()->as().get_primitive(); + auto copy = std::shared_ptr(new reorder(*rprim)); auto id = rprim->id; auto cache_ptr = cin->as().get_primitive()->cache_info; auto layout_ptr = std::make_shared(cin->get_output_layout()); - weightless_cache_map.emplace(id, std::make_pair(cache_ptr, layout_ptr)); + weightless_cache_map.emplace(id, std::make_tuple(cache_ptr, layout_ptr, copy)); } } @@ -165,17 +155,15 @@ propagate_constants::calculate(engine& engine, net->reset_execution(true); // wait for computations to complete auto outputs = net->get_outputs(); - std::list, std::shared_ptr>> + std::list> ret; for (auto& out : outputs) { - std::shared_ptr cache_ptr = nullptr; - std::shared_ptr layout_ptr = nullptr; + cache_tuple cache_info{}; auto it = weightless_cache_map.find(out->id()); if (it != weightless_cache_map.end()) { - cache_ptr = it->second.first; - layout_ptr = it->second.second; + cache_info = it->second; } - ret.push_back({out->id(), out->output_memory_ptr(), cache_ptr, layout_ptr}); + ret.push_back({out->id(), out->output_memory_ptr(), cache_info}); } return ret; diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 0bdcf4ef82672e..fb7f05927b249c 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -222,7 +222,10 @@ class propagate_constants : public base_pass { private: void run(program& p) override; - std::list, std::shared_ptr>> + std::list, std::shared_ptr, std::shared_ptr>>> calculate(engine& engine, const ExecutionConfig& config, std::shared_ptr task_executor); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp index f5062b4c2028cc..0e579843006d7c 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp @@ -3,22 +3,23 @@ // #include "convert_fc_to_compressed.hpp" + #include #include "intel_gpu/op/fully_connected.hpp" #include "intel_gpu/op/fully_connected_compressed.hpp" - +#include "openvino/core/rt_info.hpp" +#include "openvino/core/rt_info/weightless_caching_attributes.hpp" #include "openvino/op/constant.hpp" -#include "openvino/op/subtract.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/multiply.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/transpose.hpp" #include "openvino/op/reshape.hpp" -#include "openvino/core/rt_info.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/pattern.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/pass/pattern/op/or.hpp" #include "transformations/utils/utils.hpp" namespace ov { @@ -103,20 +104,27 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon auto new_shape = (has_transpose || !grouped) ? ov::Shape{current_shape[0] * current_shape[1], current_shape[2]} : ov::Shape{current_shape[0], current_shape[1] * current_shape[2]}; - return std::make_shared(*constant, new_shape); + auto new_constant = std::make_shared(*constant, new_shape); + + ov::copy_weightless_cache_attr(constant, new_constant); + return new_constant; }; auto convert_const_to_u8 = [&](std::shared_ptr node) { auto constant = ov::as_type_ptr(node); + std::shared_ptr result = nullptr; // Convert ZP to u8 if (constant->get_element_type() == ov::element::u8) - return std::dynamic_pointer_cast(constant); - if (constant->get_element_type() == ov::element::u4) - return std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); - if (weight_u8 && sub_with_convert) - return std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); - - return std::dynamic_pointer_cast(constant); + result = std::dynamic_pointer_cast(constant); + else if (constant->get_element_type() == ov::element::u4) + result = std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); + else if (weight_u8 && sub_with_convert) + result = std::dynamic_pointer_cast(std::make_shared(node, ov::element::u8)); + else + result = std::dynamic_pointer_cast(constant); + + ov::copy_weightless_cache_attr(node, result); + return result; }; diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 1f911d4a0f2070..1257ee02d1e69b 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -2,6 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#include + #include #include "base/ov_behavior_test_utils.hpp" @@ -14,6 +17,14 @@ #include "common_test_utils/test_common.hpp" #include "openvino/pass/serialize.hpp" #include "openvino/util/codec_xor.hpp" +#include "shared_test_classes/subgraph/weights_decompression_builders.hpp" +#ifndef WIN32 +# include +#endif + +#ifdef WIN32 +# define stat _stat +#endif namespace { typedef std::tuple testParams; @@ -30,7 +41,7 @@ class CheckWeightlessCacheAccuracy : public ::testing::Test, public ::testing::W std::ostringstream result; const char separator = '_'; result << "use_compile_model_api=" << use_compile_model_api_ << separator; - result << "_do_encryption=" << do_encryption_; + result << "do_encryption=" << do_encryption_ << separator; result << "inference_mode=" << inference_mode_ << separator; result << "model_dtype=" << model_dtype_; return result.str(); @@ -99,14 +110,44 @@ void CheckWeightlessCacheAccuracy::run() { ofstr.close(); } - auto ifstr = std::ifstream(cache_path, std::ifstream::binary); + auto get_cache_path = [&]() { + std::string path; + if (use_compile_model_api) { + auto blobs = ov::test::utils::listFilesWithExt(cache_dir, "blob"); + EXPECT_EQ(blobs.size(), 1); + path = blobs[0]; + } else { + path = cache_path; + } + return path; + }; + + auto get_mod_time = [&](const std::string& path) { + struct stat result; + if (stat(path.c_str(), &result) == 0) { + return result.st_mtime; + } + return static_cast(0); + }; + + auto first_cache_path = get_cache_path(); + auto first_mod_time = get_mod_time(first_cache_path); + ASSERT_NE(first_mod_time, static_cast(0)); + ov::CompiledModel imported_model; if (use_compile_model_api) { imported_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config); } else { + auto ifstr = std::ifstream(cache_path, std::ifstream::binary); imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path); + ifstr.close(); } - ifstr.close(); + + auto second_cache_path = get_cache_path(); + auto second_mod_time = get_mod_time(second_cache_path); + + // Something went wrong if a new cache is created during the second run. + ASSERT_EQ(first_mod_time, second_mod_time); auto orig_req = compiled_model.create_infer_request(); auto new_req = imported_model.create_infer_request(); @@ -148,6 +189,35 @@ TEST_P(CheckWeightlessCacheAccuracy, TiWithLstmCell) { OV_ASSERT_NO_THROW(run()); } +class CheckWeightlessCacheAccuracyLowPrecision : public CheckWeightlessCacheAccuracy {}; + +TEST_P(CheckWeightlessCacheAccuracyLowPrecision, MatmulWeightsDecompression) { + ov::test::MatMulDecompressionShapeParams shape_params{{{}, {{1, 4, 16}}}, {1, 16, 32}}; + auto dynShape = shape_params.data_shape.first; + if (dynShape.rank() == 0) { + dynShape = shape_params.data_shape.second.front(); + } + ov::ParameterVector params{std::make_shared(ov::element::f32, dynShape)}; + const auto weights_subgraph = ov::test::initMatMulDecompressionSubgraph(shape_params.weights_shape, + shape_params.decompression_group_size, + ov::element::f32, + model_dtype, + ov::element::f32, + ov::element::undefined, + true, + ov::test::DecompressionType::full, + ov::test::DecompressionType::full, + false); + auto matmul = std::make_shared(params[0], weights_subgraph); + + ov::ResultVector results; + for (const auto& output : matmul->outputs()) { + results.push_back(std::make_shared(output)); + } + model = std::make_shared(results, params, "MatmulWeightsDecompression"); + OV_ASSERT_NO_THROW(run()); +} + const std::vector inference_modes = { ov::element::f32, ov::element::f16, @@ -159,6 +229,12 @@ const std::vector model_dtypes = { ov::element::bf16, }; +const std::vector low_precision_dtypes = { + ov::element::u8, + ov::element::u4, + ov::element::i4, +}; + INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, CheckWeightlessCacheAccuracy, ::testing::Combine(::testing::Bool(), @@ -167,4 +243,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, ::testing::ValuesIn(model_dtypes)), CheckWeightlessCacheAccuracy::get_test_case_name); +INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracyLowPrecision, + CheckWeightlessCacheAccuracyLowPrecision, + ::testing::Combine(::testing::Bool(), + ::testing::Bool(), + ::testing::ValuesIn(inference_modes), + ::testing::ValuesIn(low_precision_dtypes)), + CheckWeightlessCacheAccuracy::get_test_case_name); + } // namespace