
Commit

init
Signed-off-by: YdrMaster <ydrml@hotmail.com>
YdrMaster committed Feb 8, 2024
0 parents commit 4fab589
Showing 8 changed files with 267 additions and 0 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,52 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# rust-clippy is a tool that runs a bunch of lints to catch common
# mistakes in your Rust code and help improve your Rust code.
# More details at https://github.com/rust-lang/rust-clippy
# and https://rust-lang.github.io/rust-clippy/

name: CI

on:
  push:
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
    paths:
      - '**.md'
      - 'LICENSE'

jobs:
  rust-clippy-analyze:
    name: Run rust-clippy analyzing
    runs-on: ubuntu-latest
    permissions:
      security-events: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Check format
        run: cargo fmt --check

      - name: Run test
        run: cargo test

      - name: Install required cargo
        run: cargo install clippy-sarif sarif-fmt

      - name: Run rust-clippy
        run:
          cargo clippy
          --all-features
          --message-format=json | clippy-sarif | tee rust-clippy-results.sarif | sarif-fmt
        continue-on-error: true

      - name: Upload analysis results to GitHub
        uses: github/codeql-action/upload-sarif@v2
        with:
          sarif_file: rust-clippy-results.sarif
          wait-for-processing: true
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
**/.*/*
!**/.cargo/*
!/.github/*

/target
/Cargo.lock
3 changes: 3 additions & 0 deletions Cargo.toml
@@ -0,0 +1,3 @@
[workspace]
members = ["tokenizer"]
resolver = "2"
11 changes: 11 additions & 0 deletions tokenizer/Cargo.toml
@@ -0,0 +1,11 @@
[package]
name = "tokenizer"
version = "0.0.0"
edition = "2021"
authors = ["YdrMaster <ydrml@hotmail.com>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
patricia_tree = "0.8"
memmap2 = "0.9"
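patricia_tree supplies the longest-prefix lookup that the vocabulary tokenizer below is built on, and memmap2 maps model and vocabulary files read-only. A minimal sketch of that prefix lookup, assuming only the patricia_tree 0.8 API as it is exercised later in tokenizer/src/vocab_txt.rs; the keys and ids are invented for illustration:

use patricia_tree::PatriciaMap;

fn main() {
    let mut trie = PatriciaMap::new();
    // Insert a few pieces, each mapped to a token id.
    trie.insert("he", 2u32);
    trie.insert("hell", 1u32);
    trie.insert("hello", 0u32);
    // The longest inserted key that prefixes the query is returned,
    // which is exactly the greedy step the tokenizer's encode loop relies on.
    let (prefix, id) = trie.get_longest_common_prefix("hello, world").unwrap();
    assert_eq!(prefix.len(), "hello".len());
    assert_eq!(*id, 0);
    println!("matched {} bytes -> token {id}", prefix.len());
}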
73 changes: 73 additions & 0 deletions tokenizer/src/bpe.rs
@@ -0,0 +1,73 @@
use crate::{utok, Tokenizer};
use std::path::Path;

pub struct BPE {
    mmap: memmap2::Mmap,
    offsets: Vec<usize>,
}

impl BPE {
    pub fn from_model(model_file: impl AsRef<Path>) -> Self {
        let file = std::fs::File::open(model_file).unwrap();
        let mmap = unsafe { memmap2::Mmap::map(&file) }.unwrap();
        // format: 10 <total_len> 10 <str_len> <str;str_len> 21 <score;4> []
        let mut offsets = Vec::new();
        let mut offset = 0usize;
        loop {
            let slice = &mmap[offset..];
            if slice.is_empty() || slice[0] != 10 {
                break;
            }
            offsets.push(offset + 3);
            offset += 2 + slice[1] as usize;
        }
        Self { mmap, offsets }
    }

    #[inline]
    fn get_piece(&self, i: utok) -> &str {
        let offset = self.offsets[i as usize];
        let slice = &self.mmap[offset..];
        let len = slice[0] as usize;
        std::str::from_utf8(&slice[1..][..len]).unwrap()
    }

    #[inline]
    fn get_score(&self, i: utok) -> f32 {
        let offset = self.offsets[i as usize];
        let slice = &self.mmap[offset..];
        let len = slice[0] as usize;
        let ptr = slice[len + 2..].as_ptr().cast::<f32>();
        unsafe { ptr.read_unaligned() }
    }
}

impl Tokenizer for BPE {
    fn bos(&self) -> crate::utok {
        todo!()
    }

    fn eos(&self) -> crate::utok {
        todo!()
    }

    fn max_piece_len(&self) -> usize {
        todo!()
    }

    fn encode(&self, text: &str, bos: bool, eos: bool) -> Vec<crate::utok> {
        todo!()
    }

    fn decode(&self, token: crate::utok, next: crate::utok) -> &str {
        todo!()
    }
}

#[test]
fn read_tokenizer() {
    let bpe = BPE::from_model("tokenizer.model");
    for i in 0..bpe.offsets.len() {
        println!("{}: {}", bpe.get_piece(i as utok), bpe.get_score(i as utok));
    }
}
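The `// format:` comment in from_model assumes the model file is a flat run of records shaped `0x0A <total_len> 0x0A <str_len> <str bytes> 0x15 <4-byte score>`, i.e. a SentencePiece-style layout whose length fields fit in a single byte. A self-contained sketch under that same assumption, building one record in memory and reading the piece and score back the way the stored offsets are used above; the piece text and score value are invented:

fn main() {
    let piece = b"hi";
    let score = -3.5f32;
    // Record header: field tag 0x0A, then the length of the embedded piece message.
    let mut record = vec![0x0A, (piece.len() + 7) as u8];
    record.push(0x0A);                              // piece-string tag
    record.push(piece.len() as u8);                 // <str_len>
    record.extend_from_slice(piece);                // <str bytes>
    record.push(0x15);                              // score tag
    record.extend_from_slice(&score.to_le_bytes()); // <score;4>

    // BPE::from_model stores `offset + 3`, i.e. the position of <str_len>.
    let offset = 3;
    let slice = &record[offset..];
    let len = slice[0] as usize;
    let text = std::str::from_utf8(&slice[1..][..len]).unwrap();
    // get_score does an unaligned pointer read; from_le_bytes matches it on little-endian targets.
    let val = f32::from_le_bytes(slice[len + 2..][..4].try_into().unwrap());
    assert_eq!(text, "hi");
    assert_eq!(val, -3.5);
    println!("{text}: {val}");
}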
17 changes: 17 additions & 0 deletions tokenizer/src/lib.rs
@@ -0,0 +1,17 @@
mod bpe;
mod vocab_txt;

/// `utok` for token id.
#[allow(non_camel_case_types)]
pub type utok = u32;

pub trait Tokenizer {
    fn bos(&self) -> utok;
    fn eos(&self) -> utok;
    fn max_piece_len(&self) -> usize;
    fn encode(&self, text: &str, bos: bool, eos: bool) -> Vec<utok>;
    fn decode(&self, token: utok, next: utok) -> &str;
}

pub use bpe::BPE;
pub use vocab_txt::VocabTxt;
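Everything else in the workspace is expected to reach a tokenizer only through this trait. A minimal sketch of caller code written against it, assuming the crate is consumed as a dependency named `tokenizer`; the function name and the way the prompt is printed are illustrative, not part of this commit:

use tokenizer::{utok, Tokenizer};

/// Encode a prompt with BOS prepended, then print the decoded pieces back out.
fn dump_tokens(t: &impl Tokenizer, prompt: &str) {
    let tokens: Vec<utok> = t.encode(prompt, true, false);
    // decode takes the previous token as context (e.g. to strip the space after BOS),
    // so the loop threads it through.
    let mut prev = t.bos();
    for &tok in &tokens {
        print!("{}", t.decode(prev, tok));
        prev = tok;
    }
    println!();
}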
105 changes: 105 additions & 0 deletions tokenizer/src/vocab_txt.rs
@@ -0,0 +1,105 @@
use super::{utok, Tokenizer};
use memmap2::Mmap;
use patricia_tree::PatriciaMap;
use std::{fs::File, path::Path};

/// A tokenizer backed by a plain vocabulary list.
pub struct VocabTxt {
    /// The vocabulary.
    words: Vec<String>,
    /// Prefix trie over the vocabulary pieces.
    trie: PatriciaMap<utok>,
    /// Maximum length of a vocabulary piece.
    max_piece_len: usize,
    /// Escape table for single-byte pieces.
    byte_pieces: [u8; 256],
}

impl VocabTxt {
    pub fn new(tokenizer: impl AsRef<Path>) -> Self {
        let mmap = unsafe { Mmap::map(&File::open(tokenizer).unwrap()) }.unwrap();
        let text = unsafe { std::str::from_utf8_unchecked(&mmap) };

        let mut words = Vec::new();
        let mut trie = PatriciaMap::new();
        let mut max_piece_len = 0;
        for (i, line) in text.lines().into_iter().enumerate() {
            let piece = line.strip_prefix('"').unwrap().strip_suffix('"').unwrap();
            max_piece_len = max_piece_len.max(piece.len());
            words.push(piece.to_string());
            trie.insert(piece, i as _);
        }
        let mut ans = Self {
            words,
            trie,
            max_piece_len,
            byte_pieces: [0; 256],
        };
        for i in 0..=255u8 {
            ans.byte_pieces[i as usize] = i;
        }
        ans
    }
}

impl Tokenizer for VocabTxt {
    #[inline]
    fn bos(&self) -> utok {
        1
    }

    #[inline]
    fn eos(&self) -> utok {
        2
    }

    #[inline]
    fn max_piece_len(&self) -> usize {
        self.max_piece_len
    }

    fn encode(&self, mut text: &str, bos: bool, eos: bool) -> Vec<utok> {
        let mut tokens = Vec::<utok>::new();
        if bos {
            tokens.push(self.bos());
        }

        while !text.is_empty() {
            let piece = if text.len() > self.max_piece_len {
                &text[..self.max_piece_len]
            } else {
                text
            };
            if let Some((pre, tok)) = self.trie.get_longest_common_prefix(piece) {
                tokens.push(*tok);
                text = &text[pre.len()..];
            } else {
                let mut chars = text.chars();
                let char = chars.next().unwrap();
                tokens.extend(char.to_string().bytes().map(|b| (b + 3) as utok));
                text = chars.as_str();
            }
        }

        if bos {
            assert_eq!(tokens[0], self.bos());
        }
        if eos {
            tokens.push(self.eos());
        }
        tokens
    }

    fn decode(&self, token: utok, next: utok) -> &str {
        let piece = self.words[next as usize].as_str();
        if let Some(byte) = piece.strip_prefix("<0x").and_then(|s| s.strip_suffix('>')) {
            let byte = u8::from_str_radix(byte, 16).unwrap();
            let byte = &self.byte_pieces[byte as usize..][..1];
            unsafe { std::str::from_utf8_unchecked(byte) }
        } else if token == self.bos() && piece.starts_with(' ') {
            &piece[1..]
        } else {
            piece
        }
    }
}
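When no vocabulary piece matches, encode falls back to one token per UTF-8 byte shifted by 3, presumably to skip the unk/BOS/EOS slots, and decode reverses pieces spelled like `<0x41>` through byte_pieces. A small sketch of both directions of that byte fallback; the id layout it implies (byte tokens at ids 3..=258) is an assumption of this note, not something the file checks:

fn main() {
    // Encoding side: each UTF-8 byte of an unmatched character becomes `byte + 3`.
    // (Widening to u32 before adding avoids u8 overflow for bytes >= 0xFD.)
    let ch = '€';
    let fallback: Vec<u32> = ch.to_string().bytes().map(|b| (b as u32) + 3).collect();
    assert_eq!(fallback, vec![0xE2 + 3, 0x82 + 3, 0xAC + 3]);

    // Decoding side: a piece literally spelled "<0x41>" maps back to the single byte 0x41.
    let piece = "<0x41>";
    let byte = piece
        .strip_prefix("<0x")
        .and_then(|s| s.strip_suffix('>'))
        .map(|hex| u8::from_str_radix(hex, 16).unwrap())
        .unwrap();
    assert_eq!(byte as char, 'A');
    println!("fallback ids: {fallback:?}, decoded byte: {}", byte as char);
}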
Binary file added tokenizer/tokenizer.model
Binary file not shown.
