From 3076e0bf2661adba4533ddb6f676b5a8123351fa Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Thu, 14 Sep 2023 09:59:19 -0400
Subject: [PATCH] Only show GPU when we're actually using it.

---
 gpt4all-backend/llamamodel.cpp    | 10 ++++++++++
 gpt4all-backend/llamamodel_impl.h |  1 +
 gpt4all-backend/llmodel.h         |  1 +
 gpt4all-backend/replit.cpp        | 14 +++++++++++---
 gpt4all-backend/replit_impl.h     |  1 +
 gpt4all-chat/chatllm.cpp          |  5 +++++
 6 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 78061dcd83ab..b92f6e876229 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -337,6 +337,16 @@ bool LLamaModel::hasGPUDevice()
 #endif
 }
 
+bool LLamaModel::usingGPUDevice()
+{
+#if defined(GGML_USE_KOMPUTE)
+    return ggml_vk_using_vulkan();
+#elif defined(GGML_USE_METAL)
+    return true;
+#endif
+    return false;
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 08517dee2714..943437ee6eed 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -30,6 +30,7 @@ class LLamaModel : public LLModel {
     bool initializeGPUDevice(const GPUDevice &device) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;
+    bool usingGPUDevice() override;
 
 private:
     LLamaPrivate *d_ptr;
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 0a61cea3a576..c90edd306048 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -100,6 +100,7 @@ class LLModel {
     virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
+    virtual bool usingGPUDevice() { return false; }
 
 protected:
     // These are pure virtual because subclasses need to implement as the default implementation of
diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp
index b535b83e1ccc..196545e39c97 100644
--- a/gpt4all-backend/replit.cpp
+++ b/gpt4all-backend/replit.cpp
@@ -163,7 +163,7 @@ struct mpt_hparams {
     int32_t n_embd = 0; //max_seq_len
     int32_t n_head = 0; // n_heads
     int32_t n_layer = 0; //n_layers
-    int32_t ftype = 0; 
+    int32_t ftype = 0;
 };
 
 struct replit_layer {
@@ -220,7 +220,7 @@ static bool kv_cache_init(
     params.mem_size = cache.buf.size;
     params.mem_buffer = cache.buf.addr;
     params.no_alloc = false;
-    
+
     cache.ctx = ggml_init(params);
     if (!cache.ctx) {
         fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
@@ -503,7 +503,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
     }
 
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "data", data_ptr, data_size, max_size));
-    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx), 
+    GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx),
         ggml_get_mem_size(model.kv_self.ctx), 0));
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf.addr, model.eval_buf.size, 0));
     GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf.addr, model.scr0_buf.size, 0));
@@ -975,6 +975,14 @@ const std::vector<LLModel::Token> &Replit::endTokens() const
     return fres;
 }
 
+bool Replit::usingGPUDevice()
+{
+#if defined(GGML_USE_METAL)
+    return true;
+#endif
+    return false;
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
diff --git a/gpt4all-backend/replit_impl.h b/gpt4all-backend/replit_impl.h
index f635f30dbab5..97e07fa9b582 100644
--- a/gpt4all-backend/replit_impl.h
+++ b/gpt4all-backend/replit_impl.h
@@ -27,6 +27,7 @@ class Replit : public LLModel {
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
+    bool usingGPUDevice() override;
 
 private:
    ReplitPrivate *d_ptr;
diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp
index 0efc0c71fe4c..5d6ea6d3a352 100644
--- a/gpt4all-chat/chatllm.cpp
+++ b/gpt4all-chat/chatllm.cpp
@@ -302,6 +302,11 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             m_llModelInfo = LLModelInfo();
             emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename()));
         } else {
+            // We may have had to fall back to CPU after load if it was not possible to accelerate
+            // the model, for instance if the quantization method is not supported on Vulkan yet
+            if (actualDevice != "CPU" && !m_llModelInfo.model->usingGPUDevice())
+                emit reportDevice("CPU");
+
             switch (m_llModelInfo.model->implementation().modelType()[0]) {
             case 'L': m_llModelType = LLModelType::LLAMA_; break;
             case 'G': m_llModelType = LLModelType::GPTJ_; break;
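
Note for reviewers: below is a minimal sketch of how a caller outside of ChatLLM might consume the new LLModel::usingGPUDevice() override after load. The helper name, the requested-device string, and the return convention are illustrative assumptions and are not part of this patch.

// Hypothetical helper (not in this patch): decide which device label a UI
// should display once a model has been loaded.
#include <string>

#include "llmodel.h"

std::string effectiveDeviceLabel(LLModel *model, const std::string &requestedDevice)
{
    // Even when a GPU device was requested and initialized, the backend may
    // have silently fallen back to CPU (e.g. a quantization not yet supported
    // on Vulkan), so trust usingGPUDevice() rather than the original request.
    if (requestedDevice != "CPU" && !model->usingGPUDevice())
        return "CPU";
    return requestedDevice;
}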