From 3076e0bf2661adba4533ddb6f676b5a8123351fa Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Thu, 14 Sep 2023 09:59:19 -0400
Subject: [PATCH] Only show GPU when we're actually using it.

---
 gpt4all-backend/llamamodel.cpp    | 10 ++++++++++
 gpt4all-backend/llamamodel_impl.h |  1 +
 gpt4all-backend/llmodel.h         |  1 +
 gpt4all-backend/replit.cpp        | 14 +++++++++++---
 gpt4all-backend/replit_impl.h     |  1 +
 gpt4all-chat/chatllm.cpp          |  5 +++++
 6 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 78061dcd83ab..b92f6e876229 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -337,6 +337,16 @@ bool LLamaModel::hasGPUDevice()
 #endif
 }
 
+bool LLamaModel::usingGPUDevice()
+{
+#if defined(GGML_USE_KOMPUTE)
+    return ggml_vk_using_vulkan();
+#elif defined(GGML_USE_METAL)
+    return true;
+#endif
+    return false;
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 08517dee2714..943437ee6eed 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -30,6 +30,7 @@ class LLamaModel : public LLModel {
     bool initializeGPUDevice(const GPUDevice &device) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;
+    bool usingGPUDevice() override;
 
 private:
     LLamaPrivate *d_ptr;
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 0a61cea3a576..c90edd306048 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -100,6 +100,7 @@ class LLModel {
     virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
+    virtual bool usingGPUDevice() { return false; }
 
 protected:
     // These are pure virtual because subclasses need to implement as the default implementation of
diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp
index b535b83e1ccc..196545e39c97 100644
--- a/gpt4all-backend/replit.cpp
+++ b/gpt4all-backend/replit.cpp
@@ -163,7 +163,7 @@ struct mpt_hparams {
     int32_t n_embd = 0; //max_seq_len
     int32_t n_head = 0; // n_heads
     int32_t n_layer = 0; //n_layers
-    int32_t ftype = 0; 
+    int32_t ftype = 0;
 };
 
 struct replit_layer {
@@ -220,7 +220,7 @@ static bool kv_cache_init(
     params.mem_size = cache.buf.size;
     params.mem_buffer = cache.buf.addr;
     params.no_alloc = false;
-    
+
     cache.ctx = ggml_init(params);
     if (!cache.ctx) {
         fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
@@ -503,7 +503,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
     }
 
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "data", data_ptr, data_size, max_size));
-    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx), 
+    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx),
         ggml_get_mem_size(model.kv_self.ctx), 0));
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf.addr, model.eval_buf.size, 0));
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf.addr, model.scr0_buf.size, 0));
@@ -975,6 +975,14 @@ const std::vector<LLModel::Token> &Replit::endTokens() const
     return fres;
 }
 
+bool Replit::usingGPUDevice()
+{
+#if defined(GGML_USE_METAL)
+    return true;
+#endif
+    return false;
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
diff --git a/gpt4all-backend/replit_impl.h b/gpt4all-backend/replit_impl.h
index f635f30dbab5..97e07fa9b582 100644
--- a/gpt4all-backend/replit_impl.h
+++ b/gpt4all-backend/replit_impl.h
@@ -27,6 +27,7 @@ class Replit : public LLModel {
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
+    bool usingGPUDevice() override;
 
 private:
    ReplitPrivate *d_ptr;
diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp
index 0efc0c71fe4c..5d6ea6d3a352 100644
--- a/gpt4all-chat/chatllm.cpp
+++ b/gpt4all-chat/chatllm.cpp
@@ -302,6 +302,11 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             m_llModelInfo = LLModelInfo();
             emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename()));
         } else {
+            // We may have had to fall back to CPU after load if it was not possible to accelerate
+            // the model, for instance if the quantization method is not supported on Vulkan yet
+            if (actualDevice != "CPU" && !m_llModelInfo.model->usingGPUDevice())
+                emit reportDevice("CPU");
+
             switch (m_llModelInfo.model->implementation().modelType()[0]) {
             case 'L': m_llModelType = LLModelType::LLAMA_; break;
             case 'G': m_llModelType = LLModelType::GPTJ_; break;
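
Note for reviewers: below is a minimal sketch of how a caller outside of ChatLLM might consume the new LLModel::usingGPUDevice() override after load. The helper name, the requested-device string, and the return convention are illustrative assumptions and are not part of this patch.

// Hypothetical helper (not in this patch): decide which device label a UI
// should display once a model has been loaded.
#include <string>

#include "llmodel.h"

std::string effectiveDeviceLabel(LLModel *model, const std::string &requestedDevice)
{
    // Even when a GPU device was requested and initialized, the backend may
    // have silently fallen back to CPU (e.g. a quantization not yet supported
    // on Vulkan), so trust usingGPUDevice() rather than the original request.
    if (requestedDevice != "CPU" && !model->usingGPUDevice())
        return "CPU";
    return requestedDevice;
}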