LLaMa: truncate the token list to the model's vocab size (#1621)

Co-authored-by: thucpham <minhthuc.pham@systrangroup.com>
OpenNMT · Feb 12, 2024 · 8e82733
1 parent ce47032
commit 8e82733
Showing 1 changed file with 2 additions and 0 deletions.
diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py
@@ -1331,6 +1331,8 @@ def get_vocabulary(self, model, tokenizer):
         extra_ids = model.config.vocab_size - len(tokens)
         for i in range(extra_ids):
             tokens.append("<extra_id_%d>" % i)
+        if model.config.vocab_size < len(tokens):
+            tokens = tokens[: model.config.vocab_size]
 
         return tokens