From 129ef14d232d17d77463001f3c505c6df54f550d Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 22 Nov 2024 17:55:04 +0800 Subject: [PATCH] =?UTF-8?q?feat(clip):=20=E6=B7=BB=E5=8A=A0=20clip-cpu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- Cargo.toml | 1 + models/clip/common-cpu/Cargo.toml | 15 +++++++++ models/clip/common-cpu/src/lib.rs | 2 ++ models/clip/common-cpu/src/test_infer.rs | 40 ++++++++++++++++++++++++ models/clip/common/src/image.rs | 2 +- models/clip/common/src/lib.rs | 14 +++++++-- models/clip/common/src/storage.rs | 40 +++++++++++++++++++----- models/llama/common/src/storage.rs | 6 ++-- 8 files changed, 106 insertions(+), 14 deletions(-) create mode 100644 models/clip/common-cpu/Cargo.toml create mode 100644 models/clip/common-cpu/src/lib.rs create mode 100644 models/clip/common-cpu/src/test_infer.rs diff --git a/Cargo.toml b/Cargo.toml index ebd4cca7..4a51e8da 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "models/llama/ascend", "models/clip/common", + "models/clip/common-cpu", ] resolver = "2" diff --git a/models/clip/common-cpu/Cargo.toml b/models/clip/common-cpu/Cargo.toml new file mode 100644 index 00000000..d2bfa2f5 --- /dev/null +++ b/models/clip/common-cpu/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "clip-cpu" +version = "0.0.0" +edition = "2021" +authors = ["YdrMaster "] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +clip.path = "../common" +operators = { workspace = true, features = ["common-cpu"] } + +[dev-dependencies] +test-utils.workspace = true +gguf.workspace = true diff --git a/models/clip/common-cpu/src/lib.rs b/models/clip/common-cpu/src/lib.rs new file mode 100644 index 00000000..51b66777 --- /dev/null +++ b/models/clip/common-cpu/src/lib.rs @@ -0,0 +1,2 @@ +#[cfg(test)] +mod test_infer; diff --git a/models/clip/common-cpu/src/test_infer.rs b/models/clip/common-cpu/src/test_infer.rs new file mode 100644 index 00000000..471df6c0 --- /dev/null +++ b/models/clip/common-cpu/src/test_infer.rs @@ -0,0 +1,40 @@ +use clip::{ClipMeta, ClipStorage, Image}; +use gguf::GGufModel; +use std::time::Instant; +use test_utils::Inference; + +#[test] +fn test_infer() { + let Some(Inference { model, .. }) = Inference::load() else { + return; + }; + let Some(picture) = test_utils::image() else { + return; + }; + + let gguf = GGufModel::read(model.iter().map(|s| &**s)); + let storage = ClipStorage::from_gguf(&gguf); + let meta = &storage.meta; + println!("{meta:#?}"); + + let &ClipMeta { + dt_embd, + + d_image, + d_patch, + + image_mean, + image_std, + .. + } = meta; + + let time = Instant::now(); + let image = Image::load(picture); + println!("load image {:?}", time.elapsed()); + + let time = Instant::now(); + let _slices = image + .slice_uhd(9, d_image, d_patch) + .normalize(dt_embd, image_mean, image_std); + println!("slice image {:?}", time.elapsed()); +} diff --git a/models/clip/common/src/image.rs b/models/clip/common/src/image.rs index e55ac490..17f41dd7 100644 --- a/models/clip/common/src/image.rs +++ b/models/clip/common/src/image.rs @@ -160,7 +160,6 @@ where } /// NHWC rgb Tensor -> NCHW value Tensor - #[inline] pub fn to_nchw(&self) -> Tensor<&[u8]> { self.0 .destruct_array() @@ -176,6 +175,7 @@ impl ImageGrid { &self.whole } + #[inline] pub fn grid(&self) -> [usize; 2] { if let Some(grid) = &self.grid { let &[y, x, _, _] = grid.shape() else { diff --git a/models/clip/common/src/lib.rs b/models/clip/common/src/lib.rs index 55adc1d9..c7e06180 100644 --- a/models/clip/common/src/lib.rs +++ b/models/clip/common/src/lib.rs @@ -1,9 +1,11 @@ mod image; +mod storage; use gguf::ggml_quants::digit_layout::DigitLayout; use tensor::Tensor; pub use image::{Image, ImageGrid}; +pub use storage::Storage as ClipStorage; #[derive(Clone, Debug)] pub struct ClipMeta { @@ -21,6 +23,8 @@ pub struct ClipMeta { pub d: usize, pub di: usize, + pub image_mean: [f32; 3], + pub image_std: [f32; 3], pub epsilon: f32, } @@ -63,7 +67,13 @@ impl ClipMeta { } } - pub fn embd(&self) -> Tensor { - Tensor::new(self.dt_embd, &[self.n_patch(), self.n_mmproj_embd()]) + pub fn patch_embd(&self) -> Tensor { + let &Self { d, d_patch, .. } = self; + Tensor::new(self.dt_mat, &[d, 3, d_patch, d_patch]) + } + + pub fn patch_embd_bias(&self) -> Tensor { + let &Self { d, .. } = self; + Tensor::new(self.dt_bias, &[d]) } } diff --git a/models/clip/common/src/storage.rs b/models/clip/common/src/storage.rs index 2aeb3c03..5580624b 100644 --- a/models/clip/common/src/storage.rs +++ b/models/clip/common/src/storage.rs @@ -1,18 +1,18 @@ use crate::{ClipMeta, ProjectorType}; use gguf::{GGufMetaMapExt, GGufModel}; -use std::marker::PhantomData; #[derive(Clone)] pub struct Storage { pub meta: ClipMeta, - _phantom: PhantomData, + pub patch_embd_w: T, + pub patch_embd_b: T, } impl<'a> Storage<&'a [u8]> { pub fn from_gguf(gguf: &GGufModel<'a>) -> Self { let position_embd = &gguf.tensors["v.position_embd.weight"]; - let w_patch_embd = &gguf.tensors["v.patch_embd.weight"]; - let b_patch_embd = &gguf.tensors["v.patch_embd.bias"]; + let patch_embd_w = &gguf.tensors["v.patch_embd.weight"]; + let patch_embd_b = &gguf.tensors["v.patch_embd.bias"]; let projector = match gguf.get_str("clip.projector_type").unwrap() { "mlp" => ProjectorType::Mlp, @@ -28,8 +28,8 @@ impl<'a> Storage<&'a [u8]> { minicpmv_version: gguf.get_usize("clip.minicpmv_version").unwrap() as _, dt_embd: position_embd.ty, - dt_mat : w_patch_embd.ty, - dt_bias: b_patch_embd.ty, + dt_mat : patch_embd_w.ty, + dt_bias: patch_embd_b.ty, nblk : gguf.get_usize("clip.vision.block_count" ).unwrap(), d_patch: gguf.get_usize("clip.vision.patch_size" ).unwrap(), @@ -37,12 +37,36 @@ impl<'a> Storage<&'a [u8]> { nh : gguf.get_usize("clip.vision.attention.head_count" ).unwrap(), d : gguf.get_usize("clip.vision.embedding_length" ).unwrap(), di : gguf.get_usize("clip.vision.feed_forward_length" ).unwrap(), - epsilon: gguf.get_f32 ("clip.vision.attention.layer_norm_epsilon").unwrap(), + + image_mean: get_rgb(gguf, "clip.vision.image_mean"), + image_std : get_rgb(gguf, "clip.vision.image_std" ), + epsilon : gguf.get_f32("clip.vision.attention.layer_norm_epsilon").unwrap(), }; Self { meta, - _phantom: PhantomData, + patch_embd_w: patch_embd_w.data, + patch_embd_b: patch_embd_b.data, } } } + +fn get_rgb(gguf: &GGufModel, key: &str) -> [f32; 3] { + let mut arr = gguf.get_f32_arr(key).unwrap(); + let mut ans = [0.0; 3]; + for x in ans.iter_mut() { + *x = arr.next().unwrap().unwrap(); + } + ans +} + +#[test] +fn test() { + use test_utils::Inference; + let Some(Inference { model, .. }) = Inference::load() else { + return; + }; + let gguf = GGufModel::read(model.iter().map(|s| &**s)); + let storage = Storage::from_gguf(&gguf); + println!("{:#?}", storage.meta); +} diff --git a/models/llama/common/src/storage.rs b/models/llama/common/src/storage.rs index 3535a30a..4cf78baa 100644 --- a/models/llama/common/src/storage.rs +++ b/models/llama/common/src/storage.rs @@ -215,12 +215,12 @@ impl<'w> BlkStorage<&'w [u8]> { } #[test] -fn test_load() { +fn test() { use test_utils::Inference; let Some(Inference { model, .. }) = Inference::load() else { return; }; let gguf = GGufModel::read(model.iter().map(|s| &**s)); - let llama = Storage::from_gguf(&gguf); - println!("{:?}", llama.meta); + let storage = Storage::from_gguf(&gguf); + println!("{:#?}", storage.meta); }