Commit
Signed-off-by: YdrMaster <ydrml@hotmail.com>
0 parents (initial commit), commit 4fab589
Showing 8 changed files with 267 additions and 0 deletions.
New file (+52): GitHub Actions CI workflow (under .github/workflows/)
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# rust-clippy is a tool that runs a bunch of lints to catch common
# mistakes in your Rust code and help improve your Rust code.
# More details at https://github.com/rust-lang/rust-clippy
# and https://rust-lang.github.io/rust-clippy/

name: CI

on:
  push:
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
    paths-ignore:
      - '**.md'
      - 'LICENSE'

jobs:
  rust-clippy-analyze:
    name: Run rust-clippy analysis
    runs-on: ubuntu-latest
    permissions:
      security-events: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Check format
        run: cargo fmt --check

      - name: Run test
        run: cargo test

      - name: Install required cargo
        run: cargo install clippy-sarif sarif-fmt

      - name: Run rust-clippy
        run:
          cargo clippy
          --all-features
          --message-format=json | clippy-sarif | tee rust-clippy-results.sarif | sarif-fmt
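        # Keep the job green even when clippy reports findings, so the SARIF upload below still runs.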
        continue-on-error: true

      - name: Upload analysis results to GitHub
        uses: github/codeql-action/upload-sarif@v2
        with:
          sarif_file: rust-clippy-results.sarif
          wait-for-processing: true
New file (+6): .gitignore
# Ignore the contents of hidden (dot) directories,
**/.*/*
# except .cargo and the repository's .github directory.
!**/.cargo/*
!/.github/*

/target
/Cargo.lock
New file (+3): Cargo.toml (workspace root)
[workspace]
members = ["tokenizer"]
resolver = "2"
New file (+11): tokenizer/Cargo.toml
[package]
name = "tokenizer"
version = "0.0.0"
edition = "2021"
authors = ["YdrMaster <ydrml@hotmail.com>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
# Prefix-tree map backing the plain-vocabulary tokenizer.
patricia_tree = "0.8"
# Memory-mapped file access for model/vocabulary files.
memmap2 = "0.9"
New file (+73): tokenizer/src/bpe.rs
use crate::{utok, Tokenizer};
use std::path::Path;

pub struct BPE {
    mmap: memmap2::Mmap,
    offsets: Vec<usize>,
}

impl BPE {
    pub fn from_model(model_file: impl AsRef<Path>) -> Self {
        let file = std::fs::File::open(model_file).unwrap();
        let mmap = unsafe { memmap2::Mmap::map(&file) }.unwrap();
        // format: 10 <total_len> 10 <str_len> <str;str_len> 21 <score;4> []
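        // For example, a hypothetical record for the 2-byte piece "ab" with score
        // 1.0 would be the bytes [10, 9, 10, 2, b'a', b'b', 21, 0x00, 0x00, 0x80,
        // 0x3F]; each entry of `offsets` points at the <str_len> byte.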
        let mut offsets = Vec::new();
        let mut offset = 0usize;
        loop {
            let slice = &mmap[offset..];
            // Each record starts with the tag byte 10; stop at EOF or any other tag.
            if slice.is_empty() || slice[0] != 10 {
                break;
            }
            // Skip the 10 <total_len> 10 prefix to land on <str_len>.
            offsets.push(offset + 3);
            offset += 2 + slice[1] as usize;
        }
        Self { mmap, offsets }
    }

    #[inline]
    fn get_piece(&self, i: utok) -> &str {
        let offset = self.offsets[i as usize];
        let slice = &self.mmap[offset..];
        let len = slice[0] as usize;
        std::str::from_utf8(&slice[1..][..len]).unwrap()
    }

    #[inline]
    fn get_score(&self, i: utok) -> f32 {
        let offset = self.offsets[i as usize];
        let slice = &self.mmap[offset..];
        let len = slice[0] as usize;
        // The f32 score sits right after the piece text and its tag byte 21; it has
        // no alignment guarantee inside the mmap, hence the unaligned read.
        let ptr = slice[len + 2..].as_ptr().cast::<f32>();
        unsafe { ptr.read_unaligned() }
    }
}

impl Tokenizer for BPE {
    fn bos(&self) -> crate::utok {
        todo!()
    }

    fn eos(&self) -> crate::utok {
        todo!()
    }

    fn max_piece_len(&self) -> usize {
        todo!()
    }

    fn encode(&self, text: &str, bos: bool, eos: bool) -> Vec<crate::utok> {
        todo!()
    }

    fn decode(&self, token: crate::utok, next: crate::utok) -> &str {
        todo!()
    }
}

#[test]
fn read_tokenizer() {
    let bpe = BPE::from_model("tokenizer.model");
    for i in 0..bpe.offsets.len() {
        println!("{}: {}", bpe.get_piece(i as utok), bpe.get_score(i as utok));
    }
}
New file (+17): tokenizer/src/lib.rs
mod bpe;
mod vocab_txt;

/// `utok` is the token-id type.
#[allow(non_camel_case_types)]
pub type utok = u32;

pub trait Tokenizer {
    fn bos(&self) -> utok;
    fn eos(&self) -> utok;
    fn max_piece_len(&self) -> usize;
    fn encode(&self, text: &str, bos: bool, eos: bool) -> Vec<utok>;
    fn decode(&self, token: utok, next: utok) -> &str;
}

pub use bpe::BPE;
pub use vocab_txt::VocabTxt;
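As an illustration (not part of this commit) of why `decode` takes both the previous and the current token: a hypothetical helper can walk a token sequence, letting implementations make context-sensitive choices, such as `VocabTxt` stripping the space after `bos` (see below).

use tokenizer::{utok, Tokenizer};

/// Hypothetical helper: render a token sequence by feeding `decode`
/// each token together with its predecessor, starting from `bos`.
fn detokenize<T: Tokenizer>(t: &T, tokens: &[utok]) -> String {
    let mut text = String::new();
    let mut prev = t.bos();
    for &tok in tokens {
        text.push_str(t.decode(prev, tok));
        prev = tok;
    }
    text
}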
New file (+105): tokenizer/src/vocab_txt.rs
use super::{utok, Tokenizer};
use memmap2::Mmap;
use patricia_tree::PatriciaMap;
use std::{fs::File, path::Path};

/// A tokenizer backed by a plain-text vocabulary.
pub struct VocabTxt {
    /// The vocabulary.
    words: Vec<String>,
    /// Prefix tree over the vocabulary.
    trie: PatriciaMap<utok>,
    /// Length in bytes of the longest piece.
    max_piece_len: usize,
    /// Escape table for single-byte pieces.
    byte_pieces: [u8; 256],
}

impl VocabTxt {
    pub fn new(tokenizer: impl AsRef<Path>) -> Self {
        let mmap = unsafe { Mmap::map(&File::open(tokenizer).unwrap()) }.unwrap();
        let text = unsafe { std::str::from_utf8_unchecked(&mmap) };

        let mut words = Vec::new();
        let mut trie = PatriciaMap::new();
        let mut max_piece_len = 0;
        // One double-quoted piece per line; the line number is the token id.
        for (i, line) in text.lines().enumerate() {
            let piece = line.strip_prefix('"').unwrap().strip_suffix('"').unwrap();
            max_piece_len = max_piece_len.max(piece.len());
            words.push(piece.to_string());
            trie.insert(piece, i as _);
        }
        Self {
            words,
            trie,
            max_piece_len,
            // Identity table: byte_pieces[b] == b.
            byte_pieces: std::array::from_fn(|i| i as u8),
        }
    }
}

impl Tokenizer for VocabTxt {
    #[inline]
    fn bos(&self) -> utok {
        1
    }

    #[inline]
    fn eos(&self) -> utok {
        2
    }

    #[inline]
    fn max_piece_len(&self) -> usize {
        self.max_piece_len
    }

    fn encode(&self, mut text: &str, bos: bool, eos: bool) -> Vec<utok> {
        let mut tokens = Vec::<utok>::new();
        if bos {
            tokens.push(self.bos());
        }

        while !text.is_empty() {
            // Look at no more than `max_piece_len` bytes, clipped back to a char
            // boundary so the slice cannot split a multi-byte character and panic.
            let mut end = self.max_piece_len.min(text.len());
            while !text.is_char_boundary(end) {
                end -= 1;
            }
            let piece = &text[..end];
            if let Some((pre, tok)) = self.trie.get_longest_common_prefix(piece) {
                tokens.push(*tok);
                text = &text[pre.len()..];
            } else {
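                // Byte fallback: no piece matches, so emit the character's UTF-8
                // bytes as byte tokens, offset by 3 (ids 0..=2 appear to be
                // reserved for special tokens, llama2.c-style).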
                let mut chars = text.chars();
                let char = chars.next().unwrap();
                tokens.extend(char.to_string().bytes().map(|b| (b + 3) as utok));
                text = chars.as_str();
            }
        }

        if bos {
            assert_eq!(tokens[0], self.bos());
        }
        if eos {
            tokens.push(self.eos());
        }
        tokens
    }

    fn decode(&self, token: utok, next: utok) -> &str {
        let piece = self.words[next as usize].as_str();
        if let Some(byte) = piece.strip_prefix("<0x").and_then(|s| s.strip_suffix('>')) {
            // Pieces of the form "<0xNN>" stand for the raw byte NN.
            let byte = u8::from_str_radix(byte, 16).unwrap();
            let byte = &self.byte_pieces[byte as usize..][..1];
            unsafe { std::str::from_utf8_unchecked(byte) }
        } else if token == self.bos() && piece.starts_with(' ') {
            // Drop the leading space of the first piece after `bos`.
            &piece[1..]
        } else {
            piece
        }
    }
}
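A usage sketch, not part of this commit: `VocabTxt::new` expects one double-quoted piece per line with the line number as the token id, so a test could fabricate a tiny vocabulary on disk (the file path, piece names, and special-token layout below are all assumptions) and round-trip it.

#[test]
fn vocab_roundtrip() {
    // Hypothetical vocabulary in the quoted-line format `new` parses;
    // ids 0..=2 play the role of special tokens (names are assumptions).
    let path = std::env::temp_dir().join("vocab.txt");
    std::fs::write(&path, "\"<unk>\"\n\"<s>\"\n\"</s>\"\n\" hello\"\n\" world\"\n").unwrap();

    let vocab = VocabTxt::new(&path);
    let tokens = vocab.encode(" hello world", true, true);
    assert_eq!(tokens.first(), Some(&vocab.bos())); // leading special token
    assert_eq!(tokens.last(), Some(&vocab.eos())); // trailing special token
    assert_eq!(tokens.len(), 4); // [bos, " hello", " world", eos]
}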
New file (binary): not shown, presumably the tokenizer.model consumed by the read_tokenizer test above.