From 40b19c83f60bd7416da7339caae3d9a4fcf2537f Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Wed, 15 Jan 2025 19:49:28 +0000
Subject: [PATCH] [NPUW] Serialization fixes (#28442)

Co-authored-by: Dmitry Matveev
---
 .../src/plugin/npuw/compiled_model.cpp        | 29 +++++-
 .../src/plugin/npuw/llm_compiled_model.cpp    |  9 +-
 .../src/plugin/npuw/serialization.cpp         | 92 +++++++++++++++++++
 .../src/plugin/npuw/serialization.hpp         |  5 +
 .../src/plugin/npuw/weights_bank.cpp          |  7 +-
 .../intel_npu/src/plugin/src/plugin.cpp       |  4 -
 6 files changed, 136 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 6ec6e047dddb8d..f0d9950c2e3520 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -133,9 +133,18 @@ std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
     auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
     if (properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) {
         LOG_INFO("ov::npuw::LLMCompiledModel will be created.");
-        compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
+        // Drop CACHE_DIR from the config
+        // If it's present we will be utilizing LLMCompiledModel's import
+        // and not the underlying models and submodels
+        auto config = properties;
+        config.erase(ov::cache_dir.name());
+        compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, config);
     } else {
         LOG_INFO("ov::npuw::CompiledModel will be created.");
+        // CACHE_DIR isn't supported with NPU_USE_NPUW
+        if (properties.count(ov::cache_dir.name())) {
+            OPENVINO_THROW("Option 'CACHE_DIR' is not supported with configuration: NPU_USE_NPUW : YES, NPUW_LLM : NO");
+        }
         pre_load_transform(model, properties);
         compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
     }
@@ -611,6 +620,12 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {

     // Write config
     write(stream, m_cfg);
+    // FIXME: utilize overload instead
+    write(stream, m_non_npuw_props.size());
+    for (const auto& p : m_non_npuw_props) {
+        write(stream, p.first);
+        write_any(stream, p.second);
+    }

     // Serialize compiled submodels
     write(stream, m_compiled_submodels.size());
@@ -671,6 +686,18 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(

     // Deserialize config
     read(stream, compiled->m_cfg);
+    compiled->m_cfg.parseEnvVars();
+    // FIXME: utilize overload instead
+    std::size_t props_size;
+    read(stream, props_size);
+    for (std::size_t i = 0; i < props_size; ++i) {
+        std::string key;
+        read(stream, key);
+        ov::Any val;
+        read_any(stream, val);
+        compiled->m_non_npuw_props[key] = val;
+    }
+    compiled->implement_properties();

     // Deserialize compiled submodels
     std::size_t subm_size = 0;
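A note on the serialize()/deserialize() hunks above: the non-NPUW properties travel as a size-prefixed sequence of (key, value) records, so the read loop must mirror the write loop field for field. The stand-alone sketch below shows that framing; std::map<std::string, std::string> and the write_str/read_str helpers are simplifications invented for illustration, not the plugin's actual types (the real map stores ov::Any and reuses the s11n overloads):

    // Sketch of the size-prefixed (key, value) framing; simplified types.
    #include <cassert>
    #include <map>
    #include <sstream>
    #include <string>

    void write_str(std::ostream& s, const std::string& v) {
        std::size_t n = v.size();
        s.write(reinterpret_cast<const char*>(&n), sizeof n);  // length first
        s.write(v.data(), n);                                  // then the bytes
    }

    std::string read_str(std::istream& s) {
        std::size_t n = 0;
        s.read(reinterpret_cast<char*>(&n), sizeof n);
        std::string v(n, '\0');
        s.read(&v[0], n);
        return v;
    }

    void write_props(std::ostream& s, const std::map<std::string, std::string>& m) {
        std::size_t n = m.size();
        s.write(reinterpret_cast<const char*>(&n), sizeof n);  // record count
        for (const auto& p : m) {
            write_str(s, p.first);
            write_str(s, p.second);
        }
    }

    std::map<std::string, std::string> read_props(std::istream& s) {
        std::size_t n = 0;
        s.read(reinterpret_cast<char*>(&n), sizeof n);
        std::map<std::string, std::string> m;
        for (std::size_t i = 0; i < n; ++i) {
            std::string key = read_str(s);  // read key before value on purpose
            m[key] = read_str(s);
        }
        return m;
    }

    int main() {
        std::stringstream ss;
        write_props(ss, {{"PERF_COUNT", "YES"}, {"LOG_LEVEL", "INFO"}});
        std::map<std::string, std::string> back = read_props(ss);
        assert(back.at("PERF_COUNT") == "YES");  // round-trips intact
        return 0;
    }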
                       OPENVINO_VERSION_MAJOR,
                       '.',
                       OPENVINO_VERSION_MINOR,
                       '.',
                       OPENVINO_VERSION_PATCH,
-                       " NPUW serialized by version ",
+                       "\nNPUW serialized by version ",
                        s11n_version,
-                       " NPUW current serialization version ",
+                       "\nNPUW current serialization version ",
                        NPUW_SERIALIZATION_VERSION);
     }

@@ -653,6 +653,7 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial

     // Deserialize config
     read(stream, compiled->m_cfg);
+    compiled->implement_properties();

     // Deserialize CompiledModels
     compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
index 550a1fdd384499..60417f9f241732 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
@@ -23,6 +23,10 @@ void ov::npuw::s11n::write(std::ostream& stream, const bool& var) {
     stream.write(reinterpret_cast<const char*>(&var), sizeof var);
 }

+void ov::npuw::s11n::write(std::ostream& stream, const float& var) {
+    stream.write(reinterpret_cast<const char*>(&var), sizeof var);
+}
+
 void ov::npuw::s11n::write(std::ostream& stream, const ov::npuw::compiled::Spatial& var) {
     using ov::npuw::s11n::write;

@@ -74,6 +78,43 @@ void ov::npuw::s11n::write(std::ostream& stream, const ov::Output<const ov::Node
 }

+void ov::npuw::s11n::write_any(std::ostream& stream, const ov::Any& var) {
+    if (var.is<std::string>()) {
+        write(stream, static_cast<int>(AnyType::STRING));
+        write(stream, var.as<std::string>());
+    } else if (var.is<const char*>()) {
+        // FIXME: handle properly
+        write(stream, static_cast<int>(AnyType::CHARS));
+        write(stream, std::string(var.as<const char*>()));
+    } else if (var.is<std::size_t>()) {
+        write(stream, static_cast<int>(AnyType::SIZET));
+        write(stream, var.as<std::size_t>());
+    } else if (var.is<int>()) {
+        write(stream, static_cast<int>(AnyType::INT));
+        write(stream, var.as<int>());
+    } else if (var.is<int64_t>()) {
+        write(stream, static_cast<int>(AnyType::INT64));
+        write(stream, var.as<int64_t>());
+    } else if (var.is<uint32_t>()) {
+        write(stream, static_cast<int>(AnyType::UINT32));
+        write(stream, var.as<uint32_t>());
+    } else if (var.is<uint64_t>()) {
+        write(stream, static_cast<int>(AnyType::UINT64));
+        write(stream, var.as<uint64_t>());
+    } else if (var.is<float>()) {
+        write(stream, static_cast<int>(AnyType::FLOAT));
+        write(stream, var.as<float>());
+    } else if (var.is<bool>()) {
+        write(stream, static_cast<int>(AnyType::BOOL));
+        write(stream, var.as<bool>());
+    } else {
+        NPUW_ASSERT(false && "Unsupported type");
+    }
+}
+
 void ov::npuw::s11n::read(std::istream& stream, std::streampos& var) {
     stream.read(reinterpret_cast<char*>(&var), sizeof var);
 }
@@ -89,6 +130,10 @@ void ov::npuw::s11n::read(std::istream& stream, bool& var) {
     stream.read(reinterpret_cast<char*>(&var), sizeof var);
 }

+void ov::npuw::s11n::read(std::istream& stream, float& var) {
+    stream.read(reinterpret_cast<char*>(&var), sizeof var);
+}
+
 void ov::npuw::s11n::read(std::istream& stream, ov::npuw::compiled::Spatial& var) {
     using ov::npuw::s11n::read;

@@ -169,3 +214,50 @@ void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::Node>& var)
     var->output(0).set_tensor_ptr(tensor_dummy);
     var->set_friendly_name(*names.begin());  // any_name ?
 }
+
+void ov::npuw::s11n::read_any(std::istream& stream, ov::Any& var) {
+    // FIXME: ugly, but cannot use .read(stream) here due to its usage of operator>>()
+    int type_int;
+    read(stream, type_int);
+    AnyType type = static_cast<AnyType>(type_int);
+    if (type == AnyType::STRING) {
+        std::string val;
+        read(stream, val);
+        var = std::move(val);
+    } else if (type == AnyType::CHARS) {
+        // FIXME: handle properly
+        std::string val;
+        read(stream, val);
+        var = std::move(val);
+    } else if (type == AnyType::SIZET) {
+        std::size_t val;
+        read(stream, val);
+        var = val;
+    } else if (type == AnyType::INT) {
+        int val;
+        read(stream, val);
+        var = val;
+    } else if (type == AnyType::INT64) {
+        int64_t val;
+        read(stream, val);
+        var = val;
+    } else if (type == AnyType::UINT32) {
+        uint32_t val;
+        read(stream, val);
+        var = val;
+    } else if (type == AnyType::UINT64) {
+        uint64_t val;
+        read(stream, val);
+        var = val;
+    } else if (type == AnyType::FLOAT) {
+        float val;
+        read(stream, val);
+        var = val;
+    } else if (type == AnyType::BOOL) {
+        bool val;
+        read(stream, val);
+        var = val;
+    } else {
+        NPUW_ASSERT(false && "Unsupported type");
+    }
+}
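write_any()/read_any() above implement a hand-rolled tagged union: an int tag derived from the AnyType enum is written first, and the reader branches on that tag to decode the payload that follows. Below is a compilable stand-alone illustration of the same tag-then-payload pattern; std::variant stands in for ov::Any, the Tag enum and helper names are invented for this sketch, and only three representative types are covered:

    // Stand-alone illustration of the tag-then-payload pattern.
    #include <cassert>
    #include <cstdint>
    #include <sstream>
    #include <string>
    #include <variant>

    enum class Tag : int { STRING = 0, INT64 = 1, BOOL = 2 };
    using Value = std::variant<std::string, int64_t, bool>;

    template <typename T>
    void write_pod(std::ostream& s, const T& v) {
        s.write(reinterpret_cast<const char*>(&v), sizeof v);
    }

    template <typename T>
    void read_pod(std::istream& s, T& v) {
        s.read(reinterpret_cast<char*>(&v), sizeof v);
    }

    void write_value(std::ostream& s, const Value& v) {
        if (auto ps = std::get_if<std::string>(&v)) {
            write_pod(s, static_cast<int>(Tag::STRING));  // tag goes first
            std::size_t n = ps->size();
            write_pod(s, n);
            s.write(ps->data(), n);
        } else if (auto pi = std::get_if<int64_t>(&v)) {
            write_pod(s, static_cast<int>(Tag::INT64));
            write_pod(s, *pi);
        } else if (auto pb = std::get_if<bool>(&v)) {
            write_pod(s, static_cast<int>(Tag::BOOL));
            write_pod(s, *pb);
        }
    }

    Value read_value(std::istream& s) {
        int tag = 0;
        read_pod(s, tag);  // the tag tells the reader which branch to take
        switch (static_cast<Tag>(tag)) {
        case Tag::STRING: {
            std::size_t n = 0;
            read_pod(s, n);
            std::string v(n, '\0');
            s.read(&v[0], n);
            return v;
        }
        case Tag::INT64: {
            int64_t v = 0;
            read_pod(s, v);
            return v;
        }
        case Tag::BOOL: {
            bool v = false;
            read_pod(s, v);
            return v;
        }
        }
        assert(false && "unsupported tag");
        return {};
    }

    int main() {
        std::stringstream ss;
        write_value(ss, Value{std::string("NPU")});
        write_value(ss, Value{int64_t{42}});
        assert(std::get<std::string>(read_value(ss)) == "NPU");
        assert(std::get<int64_t>(read_value(ss)) == 42);
        return 0;
    }

The same design constraint applies to the real code: any value type the writer can emit must have a matching branch in the reader, which is why both functions end in NPUW_ASSERT for types neither side knows about.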
diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
index 77a6b3aa865254..170631f644da12 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
@@ -27,6 +27,7 @@ class Config;
 namespace ov {

 // Forward declaration
+class Any;
 class Node;
 class Tensor;
 template <typename T>
@@ -52,19 +53,23 @@ namespace s11n {
 void write(std::ostream& stream, const std::streampos& var);
 void write(std::ostream& stream, const std::string& var);
 void write(std::ostream& stream, const bool& var);
+void write(std::ostream& stream, const float& var);
 void write(std::ostream& stream, const ov::npuw::compiled::Spatial& var);
 void write(std::ostream& stream, const ov::Tensor& var);
 void write(std::ostream& stream, const ::intel_npu::Config& var);
 void write(std::ostream& stream, const ov::Output<const ov::Node>& var);
+void write_any(std::ostream& stream, const ov::Any& var);

 void read(std::istream& stream, std::streampos& var);
 void read(std::istream& stream, std::string& var);
 void read(std::istream& stream, bool& var);
+void read(std::istream& stream, float& var);
 void read(std::istream& stream, ov::npuw::compiled::Spatial& var);
 void read(std::istream& stream, ov::Tensor& var);
 void read(std::istream& stream, ::intel_npu::Config& var);
 void read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var);
 void read(std::istream& stream, std::shared_ptr<ov::Node>& var);
+void read_any(std::istream& stream, ov::Any& var);

 // Forward declaration
 template <typename T>
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
index c554d1dcb81b18..125c8b3ab52f99 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -219,7 +219,8 @@ void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::str
     auto iter_device = device_bank.storage.find(uid);

     if (iter_device != device_bank.storage.end()) {
-        // Already allocated
+        // Shouldn't be possible
+        NPUW_ASSERT(false);
         return;
     }
@@ -234,6 +235,10 @@
     ov::Tensor allocated_tensor;

     // FIXME: reading not via a dedicated function
+    bool is_intialized = false;
+    read(stream, is_intialized);
+    NPUW_ASSERT(is_intialized);
+
     std::string type_str;
     read(stream, type_str);
     ov::element::Type type(type_str);
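The weights_bank.cpp hunks above tighten the reader's assumptions: hitting an already-present uid is now treated as a logic error, and each tensor record is preceded by an "initialized" marker that is asserted before the payload is decoded. A minimal stand-alone sketch of that marker-then-payload guard follows; the names and byte layout here are illustrative only, not the bank's actual record format:

    // Sketch of a marker-then-payload guard: assert on the flag before
    // trusting the bytes that follow it.
    #include <cassert>
    #include <sstream>
    #include <vector>

    void write_blob(std::ostream& s, const std::vector<char>& data) {
        bool initialized = !data.empty();
        s.write(reinterpret_cast<const char*>(&initialized), sizeof initialized);
        std::size_t n = data.size();
        s.write(reinterpret_cast<const char*>(&n), sizeof n);
        s.write(data.data(), n);
    }

    std::vector<char> read_blob(std::istream& s) {
        bool initialized = false;
        s.read(reinterpret_cast<char*>(&initialized), sizeof initialized);
        assert(initialized && "blob must have been serialized as initialized");
        std::size_t n = 0;
        s.read(reinterpret_cast<char*>(&n), sizeof n);
        std::vector<char> data(n);
        s.read(data.data(), n);
        return data;
    }

    int main() {
        std::stringstream ss;
        write_blob(ss, {'a', 'b', 'c'});
        std::vector<char> back = read_blob(ss);
        assert(back.size() == 3 && back[0] == 'a');
        return 0;
    }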
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index 5d5f666f5119ec..301a71887054f2 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -622,10 +622,6 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr
     ov::AnyMap localProperties = properties;
     if (localProperties.count(useNpuwKey)) {
         if (localProperties.at(useNpuwKey).as<bool>() == true) {
-            // CACHE_DIR isn't supported with NPU_USE_NPUW
-            if (localProperties.count(ov::cache_dir.name()) || !_globalConfig.get<CACHE_DIR>().empty()) {
-                OPENVINO_THROW("Option 'CACHE_DIR' is not supported with NPU_USE_NPUW!");
-            }
             return ov::npuw::ICompiledModel::create(model->clone(), shared_from_this(), localProperties);
         } else {
             // NPUW is disabled, remove the key from the properties
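With this hunk the CACHE_DIR rejection is gone from plugin.cpp; the equivalent gate now lives in ov::npuw::ICompiledModel::create (first hunk of this patch), where the LLM and non-LLM paths diverge: the LLM path drops the option and relies on LLMCompiledModel's own import, while the plain NPUW path still throws. A rough sketch of that decision, with std::map standing in for ov::AnyMap and a literal string for ov::cache_dir.name():

    // Sketch of the relocated CACHE_DIR gate; simplified types and key name.
    #include <map>
    #include <stdexcept>
    #include <string>

    using AnyMap = std::map<std::string, std::string>;
    const std::string CACHE_DIR_KEY = "CACHE_DIR";

    AnyMap prepare_npuw_config(const AnyMap& properties, bool llm_enabled) {
        AnyMap config = properties;
        if (llm_enabled) {
            // LLMCompiledModel handles import itself, so the option is dropped.
            config.erase(CACHE_DIR_KEY);
        } else if (config.count(CACHE_DIR_KEY)) {
            throw std::runtime_error(
                "Option 'CACHE_DIR' is not supported with NPU_USE_NPUW : YES, NPUW_LLM : NO");
        }
        return config;
    }

    int main() {
        AnyMap props{{"CACHE_DIR", "/tmp/cache"}, {"NPUW_LLM", "YES"}};
        AnyMap llm_cfg = prepare_npuw_config(props, /*llm_enabled=*/true);
        // The LLM path drops the option; the non-LLM path would throw instead.
        return llm_cfg.count(CACHE_DIR_KEY) ? 1 : 0;
    }

One difference worth noting: the removed plugin.cpp check also consulted _globalConfig.get<CACHE_DIR>(); after this patch, only the per-call properties are inspected, at create() time.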