
Commit

init
Signed-off-by: YdrMaster <ydrml@hotmail.com>
YdrMaster committed Feb 8, 2024
0 parents commit 4fab589
Showing 8 changed files with 267 additions and 0 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,52 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# rust-clippy is a tool that runs a bunch of lints to catch common
# mistakes in your Rust code and help improve your Rust code.
# More details at https://github.com/rust-lang/rust-clippy
# and https://rust-lang.github.io/rust-clippy/

name: CI

on:
  push:
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
    paths:
      - '**.md'
      - 'LICENSE'

jobs:
  rust-clippy-analyze:
    name: Run rust-clippy analyzing
    runs-on: ubuntu-latest
    permissions:
      security-events: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Check format
        run: cargo fmt --check

      - name: Run test
        run: cargo test

      - name: Install required cargo
        run: cargo install clippy-sarif sarif-fmt

      - name: Run rust-clippy
        run:
          cargo clippy
          --all-features
          --message-format=json | clippy-sarif | tee rust-clippy-results.sarif | sarif-fmt
        continue-on-error: true

      - name: Upload analysis results to GitHub
        uses: github/codeql-action/upload-sarif@v2
        with:
          sarif_file: rust-clippy-results.sarif
          wait-for-processing: true
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
**/.*/*
!**/.cargo/*
!/.github/*

/target
/Cargo.lock
3 changes: 3 additions & 0 deletions Cargo.toml
@@ -0,0 +1,3 @@
[workspace]
members = ["tokenizer"]
resolver = "2"
11 changes: 11 additions & 0 deletions tokenizer/Cargo.toml
@@ -0,0 +1,11 @@
[package]
name = "tokenizer"
version = "0.0.0"
edition = "2021"
authors = ["YdrMaster <ydrml@hotmail.com>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
patricia_tree = "0.8"
memmap2 = "0.9"
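patricia_tree supplies the longest-prefix lookup that the vocabulary tokenizer below is built on, and memmap2 maps model and vocabulary files read-only. A minimal sketch of that prefix lookup, assuming only the patricia_tree 0.8 API as it is exercised later in tokenizer/src/vocab_txt.rs; the keys and ids are invented for illustration:

use patricia_tree::PatriciaMap;

fn main() {
    let mut trie = PatriciaMap::new();
    // Insert a few pieces, each mapped to a token id.
    trie.insert("he", 2u32);
    trie.insert("hell", 1u32);
    trie.insert("hello", 0u32);
    // The longest inserted key that prefixes the query is returned,
    // which is exactly the greedy step the tokenizer's encode loop relies on.
    let (prefix, id) = trie.get_longest_common_prefix("hello, world").unwrap();
    assert_eq!(prefix.len(), "hello".len());
    assert_eq!(*id, 0);
    println!("matched {} bytes -> token {id}", prefix.len());
}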
73 changes: 73 additions & 0 deletions tokenizer/src/bpe.rs
@@ -0,0 +1,73 @@
use crate::{utok, Tokenizer};
use std::path::Path;

pub struct BPE {
    mmap: memmap2::Mmap,
    offsets: Vec<usize>,
}

impl BPE {
    pub fn from_model(model_file: impl AsRef<Path>) -> Self {
        let file = std::fs::File::open(model_file).unwrap();
        let mmap = unsafe { memmap2::Mmap::map(&file) }.unwrap();
        // format: 10 <total_len> 10 <str_len> <str;str_len> 21 <score;4> []
        let mut offsets = Vec::new();
        let mut offset = 0usize;
        loop {
            let slice = &mmap[offset..];
            if slice.is_empty() || slice[0] != 10 {
                break;
            }
            offsets.push(offset + 3);
            offset += 2 + slice[1] as usize;
        }
        Self { mmap, offsets }
    }

    #[inline]
    fn get_piece(&self, i: utok) -> &str {
        let offset = self.offsets[i as usize];
        let slice = &self.mmap[offset..];
        let len = slice[0] as usize;
        std::str::from_utf8(&slice[1..][..len]).unwrap()
    }

    #[inline]
    fn get_score(&self, i: utok) -> f32 {
        let offset = self.offsets[i as usize];
        let slice = &self.mmap[offset..];
        let len = slice[0] as usize;
        let ptr = slice[len + 2..].as_ptr().cast::<f32>();
        unsafe { ptr.read_unaligned() }
    }
}

impl Tokenizer for BPE {
    fn bos(&self) -> crate::utok {
        todo!()
    }

    fn eos(&self) -> crate::utok {
        todo!()
    }

    fn max_piece_len(&self) -> usize {
        todo!()
    }

    fn encode(&self, text: &str, bos: bool, eos: bool) -> Vec<crate::utok> {
        todo!()
    }

    fn decode(&self, token: crate::utok, next: crate::utok) -> &str {
        todo!()
    }
}

#[test]
fn read_tokenizer() {
    let bpe = BPE::from_model("tokenizer.model");
    for i in 0..bpe.offsets.len() {
        println!("{}: {}", bpe.get_piece(i as utok), bpe.get_score(i as utok));
    }
}
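The `// format:` comment in from_model assumes the model file is a flat run of records shaped `0x0A <total_len> 0x0A <str_len> <str bytes> 0x15 <4-byte score>`, i.e. a SentencePiece-style layout whose length fields fit in a single byte. A self-contained sketch under that same assumption, building one record in memory and reading the piece and score back the way the stored offsets are used above; the piece text and score value are invented:

fn main() {
    let piece = b"hi";
    let score = -3.5f32;
    // Record header: field tag 0x0A, then the length of the embedded piece message.
    let mut record = vec![0x0A, (piece.len() + 7) as u8];
    record.push(0x0A);                              // piece-string tag
    record.push(piece.len() as u8);                 // <str_len>
    record.extend_from_slice(piece);                // <str bytes>
    record.push(0x15);                              // score tag
    record.extend_from_slice(&score.to_le_bytes()); // <score;4>

    // BPE::from_model stores `offset + 3`, i.e. the position of <str_len>.
    let offset = 3;
    let slice = &record[offset..];
    let len = slice[0] as usize;
    let text = std::str::from_utf8(&slice[1..][..len]).unwrap();
    // get_score does an unaligned pointer read; from_le_bytes matches it on little-endian targets.
    let val = f32::from_le_bytes(slice[len + 2..][..4].try_into().unwrap());
    assert_eq!(text, "hi");
    assert_eq!(val, -3.5);
    println!("{text}: {val}");
}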
17 changes: 17 additions & 0 deletions tokenizer/src/lib.rs
@@ -0,0 +1,17 @@
mod bpe;
mod vocab_txt;

/// `utok` for token id.
#[allow(non_camel_case_types)]
pub type utok = u32;

pub trait Tokenizer {
    fn bos(&self) -> utok;
    fn eos(&self) -> utok;
    fn max_piece_len(&self) -> usize;
    fn encode(&self, text: &str, bos: bool, eos: bool) -> Vec<utok>;
    fn decode(&self, token: utok, next: utok) -> &str;
}

pub use bpe::BPE;
pub use vocab_txt::VocabTxt;
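Everything else in the workspace is expected to reach a tokenizer only through this trait. A minimal sketch of caller code written against it, assuming the crate is consumed as a dependency named `tokenizer`; the function name and the way the prompt is printed are illustrative, not part of this commit:

use tokenizer::{utok, Tokenizer};

/// Encode a prompt with BOS prepended, then print the decoded pieces back out.
fn dump_tokens(t: &impl Tokenizer, prompt: &str) {
    let tokens: Vec<utok> = t.encode(prompt, true, false);
    // decode takes the previous token as context (e.g. to strip the space after BOS),
    // so the loop threads it through.
    let mut prev = t.bos();
    for &tok in &tokens {
        print!("{}", t.decode(prev, tok));
        prev = tok;
    }
    println!();
}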
105 changes: 105 additions & 0 deletions tokenizer/src/vocab_txt.rs
@@ -0,0 +1,105 @@
use super::{utok, Tokenizer};
use memmap2::Mmap;
use patricia_tree::PatriciaMap;
use std::{fs::File, path::Path};

/// A tokenizer backed by a plain vocabulary list.
pub struct VocabTxt {
    /// The vocabulary.
    words: Vec<String>,
    /// Prefix trie over the vocabulary pieces.
    trie: PatriciaMap<utok>,
    /// Maximum length of a vocabulary piece.
    max_piece_len: usize,
    /// Escape table for single-byte pieces.
    byte_pieces: [u8; 256],
}

impl VocabTxt {
    pub fn new(tokenizer: impl AsRef<Path>) -> Self {
        let mmap = unsafe { Mmap::map(&File::open(tokenizer).unwrap()) }.unwrap();
        let text = unsafe { std::str::from_utf8_unchecked(&mmap) };

        let mut words = Vec::new();
        let mut trie = PatriciaMap::new();
        let mut max_piece_len = 0;
        for (i, line) in text.lines().into_iter().enumerate() {
            let piece = line.strip_prefix('"').unwrap().strip_suffix('"').unwrap();
            max_piece_len = max_piece_len.max(piece.len());
            words.push(piece.to_string());
            trie.insert(piece, i as _);
        }
        let mut ans = Self {
            words,
            trie,
            max_piece_len,
            byte_pieces: [0; 256],
        };
        for i in 0..=255u8 {
            ans.byte_pieces[i as usize] = i;
        }
        ans
    }
}

impl Tokenizer for VocabTxt {
    #[inline]
    fn bos(&self) -> utok {
        1
    }

    #[inline]
    fn eos(&self) -> utok {
        2
    }

    #[inline]
    fn max_piece_len(&self) -> usize {
        self.max_piece_len
    }

    fn encode(&self, mut text: &str, bos: bool, eos: bool) -> Vec<utok> {
        let mut tokens = Vec::<utok>::new();
        if bos {
            tokens.push(self.bos());
        }

        while !text.is_empty() {
            let piece = if text.len() > self.max_piece_len {
                &text[..self.max_piece_len]
            } else {
                text
            };
            if let Some((pre, tok)) = self.trie.get_longest_common_prefix(piece) {
                tokens.push(*tok);
                text = &text[pre.len()..];
            } else {
                let mut chars = text.chars();
                let char = chars.next().unwrap();
                tokens.extend(char.to_string().bytes().map(|b| (b + 3) as utok));
                text = chars.as_str();
            }
        }

        if bos {
            assert_eq!(tokens[0], self.bos());
        }
        if eos {
            tokens.push(self.eos());
        }
        tokens
    }

    fn decode(&self, token: utok, next: utok) -> &str {
        let piece = self.words[next as usize].as_str();
        if let Some(byte) = piece.strip_prefix("<0x").and_then(|s| s.strip_suffix('>')) {
            let byte = u8::from_str_radix(byte, 16).unwrap();
            let byte = &self.byte_pieces[byte as usize..][..1];
            unsafe { std::str::from_utf8_unchecked(byte) }
        } else if token == self.bos() && piece.starts_with(' ') {
            &piece[1..]
        } else {
            piece
        }
    }
}
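When no vocabulary piece matches, encode falls back to one token per UTF-8 byte shifted by 3, presumably to skip the unk/BOS/EOS slots, and decode reverses pieces spelled like `<0x41>` through byte_pieces. A small sketch of both directions of that byte fallback; the id layout it implies (byte tokens at ids 3..=258) is an assumption of this note, not something the file checks:

fn main() {
    // Encoding side: each UTF-8 byte of an unmatched character becomes `byte + 3`.
    // (Widening to u32 before adding avoids u8 overflow for bytes >= 0xFD.)
    let ch = '€';
    let fallback: Vec<u32> = ch.to_string().bytes().map(|b| (b as u32) + 3).collect();
    assert_eq!(fallback, vec![0xE2 + 3, 0x82 + 3, 0xAC + 3]);

    // Decoding side: a piece literally spelled "<0x41>" maps back to the single byte 0x41.
    let piece = "<0x41>";
    let byte = piece
        .strip_prefix("<0x")
        .and_then(|s| s.strip_suffix('>'))
        .map(|hex| u8::from_str_radix(hex, 16).unwrap())
        .unwrap();
    assert_eq!(byte as char, 'A');
    println!("fallback ids: {fallback:?}, decoded byte: {}", byte as char);
}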
Binary file added tokenizer/tokenizer.model
Binary file not shown.
