touched up a few minor issues in nlp

This commit is contained in:
epi
2025-11-16 08:03:58 -05:00
parent 72ab2d9a58
commit 38ab434642
3 changed files with 16 additions and 4 deletions

View File

@@ -20,11 +20,10 @@ impl Document {
let processed = preprocess(text);
document.number_of_terms += processed.len();
for normalized in processed {
if normalized.len() >= 2 {
document.add_term(&normalized)
document.add_term(&normalized);
document.number_of_terms += 1;
}
}
document

View File

@@ -73,7 +73,11 @@ impl TfIdf {
to_add.push(score);
}
let average: f32 = to_add.iter().sum::<f32>() / to_add.len() as f32;
let average = if to_add.is_empty() {
0.0
} else {
to_add.iter().sum::<f32>() / to_add.len() as f32
};
*metadata.tf_idf_score_mut() = average;
}

View File

@@ -22,6 +22,15 @@ impl Term {
}
/// metadata to be associated with a `Term`
///
/// # Design Note
///
/// The `count` field represents the number of times a term appeared in a **single document**
/// and is only meaningful in the per-document context (i.e., within a `Document`).
///
/// When `TermMetaData` is stored in the global `TfIdf` model, the `count` field becomes stale
/// and is not used. Instead, the model relies on `term_frequencies` (which tracks the term
/// frequency for each document the term appears in) and calculates TF-IDF scores from those.
#[derive(Debug, Clone, Default)]
pub(super) struct TermMetaData {
/// number of times the associated `Term` was seen in a single document