From 8f5e61884143c2dacd08f410e5580d0e6dd9fd51 Mon Sep 17 00:00:00 2001 From: Himadri Bhattacharjee <107522312+lavafroth@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:36:23 +0530 Subject: [PATCH] fix: reshape tokenized f32 vector dimensions to `hidden_size` --- Cargo.toml | 2 +- src/embed.rs | 35 +++++++++++++++++++++++++---------- src/main.rs | 2 +- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index becc7d8..3e343b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "silos" -version = "5.0.0" +version = "5.1.0" edition = "2024" [dependencies] diff --git a/src/embed.rs b/src/embed.rs index d24b8d8..8d4ec4b 100644 --- a/src/embed.rs +++ b/src/embed.rs @@ -7,17 +7,24 @@ use hf_hub::Repo; use hf_hub::RepoType; use hf_hub::api::sync::Api; use std::path::PathBuf; -use tokenizers::Tokenizer; -use tokenizers::TokenizerImpl; +use tokenizers::DecoderWrapper; use tokenizers::ModelWrapper; use tokenizers::NormalizerWrapper; -use tokenizers::PreTokenizerWrapper; use tokenizers::PostProcessorWrapper; -use tokenizers::DecoderWrapper; +use tokenizers::PreTokenizerWrapper; +use tokenizers::Tokenizer; +use tokenizers::TokenizerImpl; pub struct Embed { model: BertModel, - tokenizer: TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>, + pub hidden_size: usize, + tokenizer: TokenizerImpl< + ModelWrapper, + NormalizerWrapper, + PreTokenizerWrapper, + PostProcessorWrapper, + DecoderWrapper, + >, } impl Embed { @@ -41,9 +48,14 @@ impl Embed { let tokenizer = tokenizer .with_padding(None) .with_truncation(None) - .map_err(E::msg)?.clone(); + .map_err(E::msg)? 
+ .clone(); - Ok(Embed { model, tokenizer }) + Ok(Embed { + model, + tokenizer, + hidden_size: config.hidden_size, + }) } fn download_model_files(model_id: &str, revision: &str) -> Result<(PathBuf, PathBuf, PathBuf)> { @@ -58,7 +70,8 @@ impl Embed { } pub(crate) fn embed(&self, prompt: &str) -> Result<Vec<f32>> { - let tokens = self.tokenizer + let tokens = self + .tokenizer .encode(prompt, true) .map_err(E::msg)? .get_ids() @@ -68,9 +81,11 @@ impl Embed { let token_type_ids = token_ids.zeros_like()?; let embeddings = self.model.forward(&token_ids, &token_type_ids, None)?; - let (_n_sentence, n_tokens, _hidden_size) = embeddings.dims3()?; + let (_n_sentence, n_tokens, hidden_size) = embeddings.dims3()?; let embeddings = (embeddings.sum(1)? / (n_tokens as f64))?; - let embeddings = normalize_l2(&embeddings)?.reshape(384)?.to_vec1::<f32>()?; + let embeddings = normalize_l2(&embeddings)? + .reshape(hidden_size)? + .to_vec1::<f32>()?; Ok(embeddings) } } diff --git a/src/main.rs b/src/main.rs index 92fa7c3..3e31dad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,7 +34,7 @@ async fn main() -> Result<()> { let (model_id, revision) = args.resolve_model_and_revision(); let embed = embed::Embed::new(args.gpu, &model_id, &revision)?; let mut dict = HashMap::default(); - let dimensions = 384; + let dimensions = embed.hidden_size; let paths = glob::glob(&format!("{}/generate/*/*.kdl", args.snippets))?; for path in paths {