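nlp: count only terms actually added to a document, guard the TF-IDF average against an empty score list, and document `TermMetaData::count`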

2026-06-03 06:41:12 -03:00 · 2025-11-16 08:03:58 -05:00
parent 72ab2d9a58
commit 38ab434642
3 changed files with 16 additions and 4 deletions
--- a/src/nlp/document.rs
+++ b/src/nlp/document.rs
@@ -20,11 +20,10 @@ impl Document {

        let processed = preprocess(text);

-        document.number_of_terms += processed.len();
-
        for normalized in processed {
            if normalized.len() >= 2 {
-                document.add_term(&normalized)
+                document.add_term(&normalized);
+                document.number_of_terms += 1;
            }
        }
        document
--- a/src/nlp/model.rs
+++ b/src/nlp/model.rs
@@ -73,7 +73,11 @@ impl TfIdf {
                to_add.push(score);
            }

-            let average: f32 = to_add.iter().sum::<f32>() / to_add.len() as f32;
+            let average = if to_add.is_empty() {
+                0.0
+            } else {
+                to_add.iter().sum::<f32>() / to_add.len() as f32
+            };

            *metadata.tf_idf_score_mut() = average;
        }
--- a/src/nlp/term.rs
+++ b/src/nlp/term.rs
@@ -22,6 +22,15 @@ impl Term {
 }

 /// metadata to be associated with a `Term`
+///
+/// # Design Note
+///
+/// The `count` field represents the number of times a term appeared in a **single document**
+/// and is only meaningful in the per-document context (i.e., within a `Document`).
+///
+/// When `TermMetaData` is stored in the global `TfIdf` model, the `count` field becomes stale
+/// and is not used. Instead, the model relies on `term_frequencies` (which tracks the term
+/// frequency for each document the term appears in) and calculates TF-IDF scores from those.
 #[derive(Debug, Clone, Default)]
 pub(super) struct TermMetaData {
    /// number of times the associated `Term` was seen in a single document