Skip to content

Commit

Permalink
fix(gguf): 修复以 char 为单位替换字符与 byte level 不兼容的问题
Browse files: browse the repository at this point in the history
Signed-off-by: YdrMaster <ydrml@hotmail.com>
  • Loading branch information
YdrMaster committed Jan 20, 2025
1 parent ccf165a commit 8729b3d
Showing 1 changed file with 14 additions and 10 deletions.
24 changes: 14 additions & 10 deletions gguf/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,18 @@ impl Tokenizer {

pub fn decode(&self, token: utok) -> Cow<str> {
let piece = self.tokenize.decode(token);
let ans = piece
.chars()
.map(|c| *self.de_replace.get(&c).unwrap_or(&c))
.collect::<String>();
if ans == piece {
piece.into()
if let Ok(piece) = from_utf8(piece) {
let ans = piece
.chars()
.map(|c| *self.de_replace.get(&c).unwrap_or(&c))
.collect::<String>();
if ans == piece {
piece.into()
} else {
ans.into()
}
} else {
ans.into()
unsafe { from_utf8_unchecked(piece) }.into()
}
}

Expand Down Expand Up @@ -133,7 +137,7 @@ trait Tokenize {
/// Encode a text into a sequence of tokens.
fn encode(&self, text: &str) -> Vec<utok>;
/// Decode a token into str.
fn decode(&self, token: utok) -> &str;
fn decode(&self, token: utok) -> &[u8];
}

impl<M: tokeneer::Method> Tokenize for Tokeneer<M> {
Expand All @@ -142,8 +146,8 @@ impl<M: tokeneer::Method> Tokenize for Tokeneer<M> {
self.encode(text)
}
#[inline]
fn decode(&self, token: utok) -> &str {
unsafe { from_utf8_unchecked(self.internal().decode(token)) }
fn decode(&self, token: utok) -> &[u8] {
self.internal().decode(token)
}
}

Expand Down

0 comments on commit 8729b3d

Please sign in to comment.