From 38ab434642a6153417abf8a885ab0bf6cf9b5694 Mon Sep 17 00:00:00 2001 From: epi <43392618+epi052@users.noreply.github.com> Date: Sun, 16 Nov 2025 08:03:58 -0500 Subject: [PATCH] touched up a few minor issues in nlp --- src/nlp/document.rs | 5 ++--- src/nlp/model.rs | 6 +++++- src/nlp/term.rs | 9 +++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/nlp/document.rs b/src/nlp/document.rs index c10e9e2..85a163f 100644 --- a/src/nlp/document.rs +++ b/src/nlp/document.rs @@ -20,11 +20,10 @@ impl Document { let processed = preprocess(text); - document.number_of_terms += processed.len(); - for normalized in processed { if normalized.len() >= 2 { - document.add_term(&normalized) + document.add_term(&normalized); + document.number_of_terms += 1; } } document diff --git a/src/nlp/model.rs b/src/nlp/model.rs index 588d2a3..d01582c 100644 --- a/src/nlp/model.rs +++ b/src/nlp/model.rs @@ -73,7 +73,11 @@ impl TfIdf { to_add.push(score); } - let average: f32 = to_add.iter().sum::<f32>() / to_add.len() as f32; + let average = if to_add.is_empty() { + 0.0 + } else { + to_add.iter().sum::<f32>() / to_add.len() as f32 + }; *metadata.tf_idf_score_mut() = average; } diff --git a/src/nlp/term.rs b/src/nlp/term.rs index 007b8ce..e37777d 100644 --- a/src/nlp/term.rs +++ b/src/nlp/term.rs @@ -22,6 +22,15 @@ impl Term { } /// metadata to be associated with a `Term` +/// +/// # Design Note +/// +/// The `count` field represents the number of times a term appeared in a **single document** +/// and is only meaningful in the per-document context (i.e., within a `Document`). +/// +/// When `TermMetaData` is stored in the global `TfIdf` model, the `count` field becomes stale +/// and is not used. Instead, the model relies on `term_frequencies` (which tracks the term +/// frequency for each document the term appears in) and calculates TF-IDF scores from those. 
#[derive(Debug, Clone, Default)] pub(super) struct TermMetaData { /// number of times the associated `Term` was seen in a single document