From 4447a275180ad6a1747879885e4619a9b675c033 Mon Sep 17 00:00:00 2001
From: "fritz.f.yan"
Date: Thu, 6 Jun 2024 08:44:35 +0800
Subject: [PATCH] fix: emoji and unicode decoding bug

Signed-off-by: fritz.f.yan
---
 gpt4all-backend/llmodel_shared.cpp | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/gpt4all-backend/llmodel_shared.cpp b/gpt4all-backend/llmodel_shared.cpp
index 1296dc4ea05b..7552d1620bad 100644
--- a/gpt4all-backend/llmodel_shared.cpp
+++ b/gpt4all-backend/llmodel_shared.cpp
@@ -207,6 +207,20 @@ void LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
     }
 }
 
+int getCharSize(unsigned char current) {
+    int charSize = 0;
+    if (current < 0x80) {
+        charSize = 1;
+    } else if ((current & 0xE0) == 0xC0) {
+        charSize = 2;
+    } else if ((current & 0xF0) == 0xE0) {
+        charSize = 3;
+    } else if ((current & 0xF8) == 0xF0) {
+        charSize = 4;
+    }
+    return charSize;
+}
+
 void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
                                std::function<bool(bool)> recalculateCallback,
                                PromptContext &promptCtx) {
@@ -215,6 +229,7 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
     std::unordered_set<std::string> reversePrompts
         = { "### Instruction", "### Prompt", "### Response", "### Human",
             "### Assistant", "### Context" };
+    std::string cache = "";
 
     // predict next tokens
     for (int i = 0; i < promptCtx.n_predict; i++) {
@@ -263,13 +278,30 @@
 
         // Empty the cache
         for (auto t : cachedTokens) {
-            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
+            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx) {
                 promptCtx.tokens.erase(promptCtx.tokens.begin());
+            }
             promptCtx.tokens.push_back(t);
             promptCtx.n_past += 1;
-            //TODO: Conversion to std::string can be avoided here...
-            if (!responseCallback(t, std::string(tokenToString(t))))
-                return;
+
+            // Fix: emit only complete UTF-8 sequences (emoji/unicode decoding bug)
+            std::string batch_str = "";
+            cache += tokenToString(t);
+            while (!cache.empty()) {
+                int cached_size = getCharSize(static_cast<unsigned char>(cache[0]));
+                if (cached_size == 0 || size_t(cached_size) > cache.length()) {
+                    break; // invalid leading byte, or the sequence is not yet fully cached
+                }
+                batch_str += cache.substr(0, cached_size);
+                cache = cache.substr(cached_size);
+            }
+
+            if (!batch_str.empty()) {
+                //TODO: Conversion to std::string can be avoided here...
+                if (!responseCallback(t, batch_str)) {
+                    return;
+                }
+            }
         }
         cachedTokens.clear();
     }
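
Note for reviewers: getCharSize() derives a UTF-8 sequence's byte length from
its leading byte (0xxxxxxx = 1 byte, 110xxxxx = 2, 1110xxxx = 3, 11110xxx = 4);
continuation bytes (10xxxxxx) and invalid leaders yield 0. Below is a minimal
standalone sketch of the same classification, compilable on its own -- the
sample characters are illustrative and not taken from the patch:

    #include <cstddef>
    #include <cstdio>
    #include <string>

    // UTF-8 stores a code point's byte count in its first byte:
    // 0xxxxxxx = 1, 110xxxxx = 2, 1110xxxx = 3, 11110xxx = 4.
    static int getCharSize(unsigned char current) {
        if (current < 0x80)           return 1;
        if ((current & 0xE0) == 0xC0) return 2;
        if ((current & 0xF0) == 0xE0) return 3;
        if ((current & 0xF8) == 0xF0) return 4;
        return 0; // continuation byte or invalid leader
    }

    int main() {
        // "A" = 1 byte, "é" = 2, "中" = 3, "🙂" = 4
        // (assumes this source file is saved as UTF-8)
        const std::string samples = "Aé中🙂";
        for (std::size_t i = 0; i < samples.size();) {
            int n = getCharSize(static_cast<unsigned char>(samples[i]));
            if (n == 0) { ++i; continue; } // skip malformed bytes
            std::printf("char at byte offset %zu spans %d byte(s)\n", i, n);
            i += static_cast<std::size_t>(n);
        }
        return 0;
    }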
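The generateResponse() change accumulates detokenized bytes in `cache` and
forwards only whole characters to responseCallback; an incomplete tail
(cached_size > cache.length()) stays buffered until the next token supplies
its continuation bytes. The following self-contained sketch of that drain
loop is fed a hypothetical token stream that splits the 4-byte emoji U+1F600
(F0 9F 98 80) across two tokens -- the token strings are invented for
illustration:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    // UTF-8 sequence length from the leading byte, as in the patch.
    static int getCharSize(unsigned char c) {
        if (c < 0x80)           return 1;
        if ((c & 0xE0) == 0xC0) return 2;
        if ((c & 0xF0) == 0xE0) return 3;
        if ((c & 0xF8) == 0xF0) return 4;
        return 0;
    }

    int main() {
        // Hypothetical detokenizer output: a BPE vocabulary can split one
        // emoji across tokens, which is exactly the reported bug.
        const std::vector<std::string> pieces = { "hi ", "\xF0\x9F", "\x98\x80", "!" };

        std::string cache;
        for (const std::string &piece : pieces) {
            cache += piece;

            // Drain only complete characters, mirroring the patched loop
            // (cache.erase is equivalent to the patch's cache.substr reassignment).
            std::string batch_str;
            while (!cache.empty()) {
                int n = getCharSize(static_cast<unsigned char>(cache[0]));
                if (n == 0 || static_cast<std::size_t>(n) > cache.length())
                    break; // invalid leader, or sequence still incomplete
                batch_str += cache.substr(0, static_cast<std::size_t>(n));
                cache.erase(0, static_cast<std::size_t>(n));
            }

            if (!batch_str.empty())
                std::cout << "emit: \"" << batch_str << "\"\n";
            else
                std::cout << "emit: (buffering, waiting for more bytes)\n";
        }
        return 0;
    }

One trade-off worth noting: if a token ever began with a stray continuation
byte, getCharSize() would return 0 and the cache would stop draining; the
patch accepts this because detokenizer output normally starts on a character
boundary.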