diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp index 113fa73e979b1b..e725f3658dfda2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/execution_config.hpp @@ -150,7 +150,6 @@ class ExecutionConfig { void apply_performance_hints(const cldnn::device_info& info); void apply_priority_hints(const cldnn::device_info& info); void apply_debug_options(const cldnn::device_info& info); - void update_specific_default_properties(const cldnn::device_info& info); template void apply_rt_info_property(const ov::Property& property, const ov::RTMap& rt_info) { @@ -169,7 +168,7 @@ class ExecutionConfig { std::map supported_properties; std::map property_validators; - bool specific_default_properties_is_set = false; + bool finalized = false; }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 5072740240e2a5..a8224c2e363f62 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -530,6 +530,7 @@ void program::init_graph() { node->get_output_layouts(); if (node->is_type()) { _config.set_property(ov::intel_gpu::use_onednn(true)); + _config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); } } // Perform initial shape_of subgraphs markup diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 0d365ef689608f..a02125d3b6cc18 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -240,10 +240,12 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< auto context_impl = get_context_impl(context); auto device_id = ov::DeviceIDParser{context_impl->get_device_name()}.get_device_id(); - OPENVINO_ASSERT(m_configs_map.find(device_id) != m_configs_map.end(), "[GPU] LoadExeNetworkImpl: Couldn't find config for GPU with id ", device_id); + OPENVINO_ASSERT(m_configs_map.find(device_id) != m_configs_map.end(), "[GPU] compile_model: Couldn't find config for GPU with id ", device_id); ExecutionConfig config = m_configs_map.at(device_id); config.set_user_property(orig_config); + if (model->has_rt_info("runtime_options")) + config.apply_rt_info(context_impl->get_engine().get_device_info(), model->get_rt_info("runtime_options"), is_llm(model)); config.apply_user_properties(context_impl->get_engine().get_device_info()); set_cache_info(model, config); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index fde86c92778ab3..51fadb49c286e7 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -5,6 +5,7 @@ #include "intel_gpu/runtime/execution_config.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "openvino/runtime/internal_properties.hpp" +#include "openvino/runtime/properties.hpp" #include @@ -59,7 +60,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED), std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}), std::make_tuple(ov::hint::dynamic_quantization_group_size, 0), - std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), + std::make_tuple(ov::hint::kv_cache_precision, ov::element::f16), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), std::make_tuple(ov::hint::activations_scale_factor, -1.f), @@ -230,26 +231,9 @@ void ExecutionConfig::apply_hints(const cldnn::device_info& info) { apply_debug_options(info); } -void ExecutionConfig::update_specific_default_properties(const cldnn::device_info& info) { - // These default properties should be set once. - if (specific_default_properties_is_set) - return; - specific_default_properties_is_set = true; - - // Enable KV-cache compression by default for non-systolic platforms MFDNN-11755 - if (get_property(ov::hint::kv_cache_precision) == ov::element::undefined && !info.supports_immad) { - set_property(ov::hint::kv_cache_precision(ov::element::i8)); - } - - // Enable dynamic quantization by default for non-systolic platforms - if (get_property(ov::hint::dynamic_quantization_group_size) == 0 && !info.supports_immad) { - set_property(ov::hint::dynamic_quantization_group_size(32)); - } -} - void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { - // Update specific default properties, call once before internal_properties updated. - update_specific_default_properties(info); + if (finalized) + return; // Copy internal properties before applying hints to ensure that // a property set by hint won't be overriden by a value in user config. @@ -280,6 +264,23 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { } } + if (!is_set_by_user(ov::hint::kv_cache_precision) || get_property(ov::hint::kv_cache_precision) == ov::element::undefined) { + if (info.supports_immad) { // MFDNN-11755 + set_property(ov::hint::kv_cache_precision(get_property(ov::hint::inference_precision))); + } else { + // Enable KV-cache compression by default for non-systolic platforms only + set_property(ov::hint::kv_cache_precision(ov::element::i8)); + } + } + + // Enable dynamic quantization by default for non-systolic platforms + if (!is_set_by_user(ov::hint::dynamic_quantization_group_size) && + get_property(ov::hint::dynamic_quantization_group_size) == 0 && !info.supports_immad) { + set_property(ov::hint::dynamic_quantization_group_size(32)); + } + + finalized = true; + user_properties.clear(); } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp index 4945cc8d717be3..32adbeeba273f3 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp @@ -539,7 +539,7 @@ class KVCacheIssueTests: public ::testing::Test { auto core = ov::test::utils::PluginCache::get().core(); ov::AnyMap properties = { - ov::hint::kv_cache_precision(ov::element::undefined) + ov::hint::kv_cache_precision(ov::element::f16) }; const size_t n_batch = 1; diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp index 7bb4a7385bcdc4..71eeba9f6673a5 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp @@ -50,7 +50,7 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI if (p.compressed) { properties.emplace(ov::hint::kv_cache_precision(ov::element::i8)); } else { - properties.emplace(ov::hint::kv_cache_precision(ov::element::undefined)); + properties.emplace(ov::hint::kv_cache_precision(ov::element::f16)); } const size_t n_heads = 16;