mirror of
https://github.com/epi052/feroxbuster.git
synced 2026-06-03 06:41:12 -03:00
touched up a few minor issues in nlp
This commit is contained in:
@@ -20,11 +20,10 @@ impl Document {
|
||||
|
||||
let processed = preprocess(text);
|
||||
|
||||
document.number_of_terms += processed.len();
|
||||
|
||||
for normalized in processed {
|
||||
if normalized.len() >= 2 {
|
||||
document.add_term(&normalized)
|
||||
document.add_term(&normalized);
|
||||
document.number_of_terms += 1;
|
||||
}
|
||||
}
|
||||
document
|
||||
|
||||
@@ -73,7 +73,11 @@ impl TfIdf {
|
||||
to_add.push(score);
|
||||
}
|
||||
|
||||
let average: f32 = to_add.iter().sum::<f32>() / to_add.len() as f32;
|
||||
let average = if to_add.is_empty() {
|
||||
0.0
|
||||
} else {
|
||||
to_add.iter().sum::<f32>() / to_add.len() as f32
|
||||
};
|
||||
|
||||
*metadata.tf_idf_score_mut() = average;
|
||||
}
|
||||
|
||||
@@ -22,6 +22,15 @@ impl Term {
|
||||
}
|
||||
|
||||
/// metadata to be associated with a `Term`
|
||||
///
|
||||
/// # Design Note
|
||||
///
|
||||
/// The `count` field represents the number of times a term appeared in a **single document**
|
||||
/// and is only meaningful in the per-document context (i.e., within a `Document`).
|
||||
///
|
||||
/// When `TermMetaData` is stored in the global `TfIdf` model, the `count` field becomes stale
|
||||
/// and is not used. Instead, the model relies on `term_frequencies` (which tracks the term
|
||||
/// frequency for each document the term appears in) and calculates TF-IDF scores from those.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(super) struct TermMetaData {
|
||||
/// number of times the associated `Term` was seen in a single document
|
||||
|
||||
Reference in New Issue
Block a user