added nlp module

2026-06-07 10:01:12 -03:00 · 2022-02-27 13:42:07 -06:00
parent ca4d8f0c52
commit eed59e1da5
7 changed files with 949 additions and 0 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,6 +30,7 @@ mod macros;
 mod url;
 mod response;
 mod message;
+mod nlp;

 /// Alias for tokio::sync::mpsc::UnboundedSender<Command>
 pub(crate) type CommandSender = UnboundedSender<Command>;
--- a/src/nlp/constants.rs
+++ b/src/nlp/constants.rs
@@ -0,0 +1,334 @@
+use lazy_static::lazy_static;
+use regex::Regex;
+
+lazy_static! {
+    /// regular expression to match on words with numbers, underscores, and hyphens
+    ///
+    /// a match is a run of ASCII letters, digits, `_`, and `-` that starts and ends on a word
+    /// boundary; the pattern is a constant, so the `unwrap` on compilation cannot fail
+    pub(super) static ref BOUNDED_WORD_REGEX: Regex = Regex::new(r"\b[a-zA-Z0-9_-]+\b").unwrap();
+}
+
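+/// collection of stop words from spaCy with small modifications; the contraction entries
+/// (e.g. `'d`, `n't`) can never match `BOUNDED_WORD_REGEX`, which has no apostrophe in its class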
+pub(super) static STOP_WORDS: [&str; 323] = [
+    "'d",
+    "'ll",
+    "'m",
+    "'re",
+    "'s",
+    "'ve",
+    "a",
+    "about",
+    "above",
+    "across",
+    "after",
+    "afterwards",
+    "again",
+    "against",
+    "almost",
+    "alone",
+    "along",
+    "already",
+    "also",
+    "although",
+    "always",
+    "am",
+    "among",
+    "amongst",
+    "amount",
+    "an",
+    "and",
+    "another",
+    "any",
+    "anyhow",
+    "anyone",
+    "anything",
+    "anyway",
+    "anywhere",
+    "are",
+    "around",
+    "as",
+    "at",
+    "back",
+    "be",
+    "became",
+    "because",
+    "become",
+    "becomes",
+    "becoming",
+    "been",
+    "before",
+    "beforehand",
+    "behind",
+    "being",
+    "below",
+    "beside",
+    "besides",
+    "between",
+    "beyond",
+    "both",
+    "bottom",
+    "but",
+    "by",
+    "ca",
+    "call",
+    "can",
+    "cannot",
+    "could",
+    "did",
+    "do",
+    "does",
+    "doing",
+    "done",
+    "down",
+    "due",
+    "during",
+    "each",
+    "eight",
+    "either",
+    "eleven",
+    "else",
+    "elsewhere",
+    "empty",
+    "enough",
+    "even",
+    "ever",
+    "every",
+    "everyone",
+    "everything",
+    "everywhere",
+    "except",
+    "few",
+    "fifteen",
+    "fifty",
+    "first",
+    "five",
+    "for",
+    "former",
+    "formerly",
+    "forty",
+    "four",
+    "from",
+    "front",
+    "full",
+    "further",
+    "get",
+    "got",
+    "give",
+    "go",
+    "had",
+    "has",
+    "have",
+    "he",
+    "hence",
+    "her",
+    "here",
+    "hereafter",
+    "hereby",
+    "herein",
+    "hereupon",
+    "hers",
+    "herself",
+    "him",
+    "himself",
+    "his",
+    "how",
+    "however",
+    "hundred",
+    "i",
+    "if",
+    "in",
+    "indeed",
+    "into",
+    "is",
+    "it",
+    "its",
+    "itself",
+    "just",
+    "keep",
+    "last",
+    "latter",
+    "latterly",
+    "least",
+    "less",
+    "made",
+    "make",
+    "many",
+    "may",
+    "me",
+    "meanwhile",
+    "might",
+    "mine",
+    "more",
+    "moreover",
+    "most",
+    "mostly",
+    "move",
+    "much",
+    "must",
+    "my",
+    "myself",
+    "n't",
+    "name",
+    "namely",
+    "neither",
+    "never",
+    "nevertheless",
+    "next",
+    "nine",
+    "no",
+    "nobody",
+    "none",
+    "noone",
+    "nor",
+    "not",
+    "nothing",
+    "now",
+    "nowhere",
+    "n\u{2018}t",
+    "n\u{2019}t",
+    "of",
+    "off",
+    "often",
+    "on",
+    "once",
+    "one",
+    "only",
+    "onto",
+    "or",
+    "other",
+    "others",
+    "otherwise",
+    "our",
+    "ours",
+    "ourselves",
+    "out",
+    "over",
+    "own",
+    "part",
+    "per",
+    "perhaps",
+    "please",
+    "put",
+    "quite",
+    "rather",
+    "re",
+    "really",
+    "regarding",
+    "same",
+    "say",
+    "see",
+    "seem",
+    "seemed",
+    "seeming",
+    "seems",
+    "serious",
+    "several",
+    "she",
+    "should",
+    "side",
+    "since",
+    "six",
+    "sixty",
+    "so",
+    "some",
+    "somehow",
+    "someone",
+    "something",
+    "sometime",
+    "sometimes",
+    "somewhere",
+    "still",
+    "such",
+    "take",
+    "ten",
+    "than",
+    "that",
+    "the",
+    "their",
+    "them",
+    "themselves",
+    "then",
+    "thence",
+    "there",
+    "thereafter",
+    "thereby",
+    "therefore",
+    "therein",
+    "thereupon",
+    "these",
+    "they",
+    "third",
+    "this",
+    "those",
+    "though",
+    "three",
+    "through",
+    "throughout",
+    "thru",
+    "thus",
+    "to",
+    "together",
+    "too",
+    "toward",
+    "towards",
+    "twelve",
+    "twenty",
+    "two",
+    "under",
+    "unless",
+    "until",
+    "up",
+    "upon",
+    "used",
+    "using",
+    "various",
+    "very",
+    "via",
+    "was",
+    "we",
+    "well",
+    "were",
+    "what",
+    "whatever",
+    "when",
+    "whence",
+    "whenever",
+    "where",
+    "whereafter",
+    "whereas",
+    "whereby",
+    "wherein",
+    "whereupon",
+    "wherever",
+    "whether",
+    "which",
+    "while",
+    "whither",
+    "who",
+    "whoever",
+    "whole",
+    "whom",
+    "whose",
+    "why",
+    "will",
+    "with",
+    "within",
+    "without",
+    "would",
+    "yet",
+    "you",
+    "your",
+    "yours",
+    "yourself",
+    "yourselves",
+    "\u{2018}d",
+    "\u{2018}ll",
+    "\u{2018}m",
+    "\u{2018}re",
+    "\u{2018}s",
+    "\u{2018}ve",
+    "\u{2019}d",
+    "\u{2019}ll",
+    "\u{2019}m",
+    "\u{2019}re",
+    "\u{2019}s",
+    "\u{2019}ve",
+];
--- a/src/nlp/document.rs
+++ b/src/nlp/document.rs
@@ -0,0 +1,200 @@
+use super::term::{Term, TermMetaData};
+use super::utils::preprocess;
+use scraper::{Html, Node, Selector};
+use std::collections::HashMap;
+
+/// data container representing a single document, in the nlp sense
+#[derive(Debug, Default)]
+pub struct Document {
+    /// collection of `Term`s and their associated metadata
+    pub terms: HashMap<Term, TermMetaData>,
+
+    /// number of words that survived pre-processing; this includes words that were too short to
+    /// be tracked in `terms`, and is the denominator used by `term_frequency`
+    number_of_terms: usize,
+}
+
+impl Document {
+    /// create a new `Document` from the given string
+    ///
+    /// The text is pushed through `preprocess`; every resulting word counts toward
+    /// `number_of_terms`, but only words longer than two bytes are tracked as `Term`s.
+    pub fn new(text: &str) -> Self {
+        let mut document = Self::default();
+
+        let processed = preprocess(text);
+
+        document.number_of_terms += processed.len();
+
+        for normalized in processed {
+            // `len` is the byte length, so this drops one and two byte words
+            if normalized.len() > 2 {
+                document.add_term(&normalized)
+            }
+        }
+        document
+    }
+
+    /// add a `Term` to the document if it's not already tracked, otherwise increment the number
+    /// of times the term has been seen
+    pub fn add_term(&mut self, word: &str) {
+        let term = Term::new(word);
+
+        let metadata = self.terms.entry(term).or_insert_with(TermMetaData::new);
+        *metadata.count_mut() += 1;
+    }
+
+    /// create a new `Document` from the given HTML string
+    ///
+    /// Non-empty text and comment nodes found beneath `<body>` are trimmed, joined with single
+    /// spaces, and handed to `Document::new`. Text whose direct parent is a `<script>` or
+    /// `<style>` element is assumed to be uninteresting and is skipped.
+    ///
+    /// Markup that parses without a `<body>` element (a `<frameset>` page, for example) yields an
+    /// empty `Document` instead of panicking.
+    pub fn from_html(raw_html: &str) -> Self {
+        // "body" is a constant, valid selector; parsing it cannot fail
+        let selector = Selector::parse("body").unwrap();
+
+        let html = Html::parse_document(raw_html);
+
+        let body = match html.select(&selector).next() {
+            Some(body) => body,
+            // nothing to extract text from; responses are untrusted, so don't unwrap here
+            None => return Self::default(),
+        };
+
+        let text = body
+            .descendants()
+            .filter_map(|node| {
+                // only Text||Comment nodes carry words; trim to weed out all-whitespace nodes
+                let trimmed = match node.value() {
+                    Node::Text(text) => text.text.trim(),
+                    Node::Comment(comment) => comment.comment.trim(),
+                    _ => return None,
+                };
+
+                if trimmed.is_empty() {
+                    return None;
+                }
+
+                // found a non-empty Text||Comment node, need to check its parent to determine if
+                // it's a <script>||<style> tag. We're assuming text within a script||style tag is
+                // uninteresting
+                match node.parent()?.value() {
+                    Node::Element(element)
+                        if !matches!(element.name(), "script" | "style") =>
+                    {
+                        // the trailing space keeps words from adjacent nodes from fusing together
+                        Some(format!("{} ", trimmed))
+                    }
+                    // parent is a script/style tag, or isn't an Element node at all
+                    _ => None,
+                }
+            })
+            .collect::<String>();
+
+        // call `new` to push the parsed html through the pre-processing pipeline and process all
+        // the words
+        Self::new(&text)
+    }
+
+    /// relative frequency of `term` within this document: the number of times the term was seen
+    /// divided by `number_of_terms`; a term the document doesn't track has a frequency of `0.0`
+    pub fn term_frequency(&self, term: &Term) -> f32 {
+        if let Some(metadata) = self.terms.get(term) {
+            metadata.count() as f32 / self.number_of_terms() as f32
+        } else {
+            0.0
+        }
+    }
+
+    /// immutable reference to the collection of terms and their metadata
+    pub(super) fn terms(&self) -> &HashMap<Term, TermMetaData> {
+        &self.terms
+    }
+
+    /// number of terms the current document knows about
+    pub(super) fn number_of_terms(&self) -> usize {
+        self.number_of_terms
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    /// `Document::new` should preprocess text and generate a hashmap of `Term, TermMetadata`
+    fn nlp_document_creation_from_text() {
+        let doc = Document::new("The air quality in Singapore got worse on Wednesday.");
+
+        // the stop words (the, in, got, on) are expected to be stripped during pre-processing
+        let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
+
+        for expected in expected_terms {
+            let term = Term::new(expected);
+            assert!(doc.terms().contains_key(&term));
+            assert_eq!(doc.number_of_terms, 5);
+            assert_eq!(doc.terms().get(&term).unwrap().count(), 1);
+
+            // since term frequencies aren't calculated on `new`, document frequency is zero in
+            // addition to the empty term_frequencies slice
+            let empty: &[f32] = &[];
+            assert_eq!(doc.terms().get(&term).unwrap().term_frequencies(), empty);
+            assert_eq!(doc.terms().get(&term).unwrap().document_frequency(), 0);
+        }
+    }
+
+    #[test]
+    /// `Document::from_html` should preprocess html and generate a hashmap of
+    /// `Term, TermMetadata`
+    fn nlp_document_creation_from_html() {
+        // markup without any text should produce a document with no terms
+        let empty = Document::from_html("<html></html>");
+        assert_eq!(empty.number_of_terms, 0);
+
+        let doc = Document::from_html(
+            "<html><body><p>The air quality in Singapore got worse on Wednesday.</p></body></html>",
+        );
+
+        let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
+
+        for expected in expected_terms {
+            let term = Term::new(expected);
+            assert_eq!(doc.number_of_terms, 5);
+            assert!(doc.terms().contains_key(&term));
+            assert_eq!(doc.terms().get(&term).unwrap().count(), 1);
+
+            // since term frequencies aren't calculated on `new`, document frequency is zero in
+            // addition to the empty term_frequencies slice
+            let empty: &[f32] = &[];
+            assert_eq!(doc.terms().get(&term).unwrap().term_frequencies(), empty);
+            assert_eq!(doc.terms().get(&term).unwrap().document_frequency(), 0);
+        }
+    }
+
+    #[test]
+    /// simple check of the `term_frequency` function's return value
+    fn term_frequency_validation() {
+        let doc = Document::new("The air quality in Singapore got worse on Wednesday. Air Jordan.");
+
+        let air_freq = doc.term_frequency(&Term::new("air"));
+
+        // "air" is seen twice among the seven words left after pre-processing: 2 / 7
+        let abs_diff = (air_freq - 0.2857143).abs();
+        assert!(abs_diff <= f32::EPSILON);
+
+        // a term the document never saw has a frequency of zero
+        let non_existent = doc.term_frequency(&Term::new("derpatronic"));
+        assert_eq!(non_existent, 0.0);
+    }
+
+    #[test]
+    /// test accessors for correctness
+    fn document_accessor_test() {
+        let doc = Document::new("The air quality in Singapore got worse on Wednesday.");
+        let keys = doc.terms().keys().map(|key| key.raw()).collect::<Vec<_>>();
+
+        let expected = ["air", "quality", "singapore", "worse", "wednesday"];
+
+        assert_eq!(doc.number_of_terms(), 5);
+
+        // every tracked key must be one of the expected words
+        for key in keys {
+            assert!(expected.contains(&key));
+        }
+    }
+}
--- a/src/nlp/mod.rs
+++ b/src/nlp/mod.rs
@@ -0,0 +1,11 @@
+//! small stand-alone tf-idf library, specifically designed for use in feroxbuster
+
+mod constants;
+mod document;
+mod model;
+mod term;
+mod utils;
+
+pub(crate) use self::document::Document;
+pub(crate) use self::model::TfIdf;
+pub(crate) use self::term::Term;
--- a/src/nlp/model.rs
+++ b/src/nlp/model.rs
@@ -0,0 +1,140 @@
+use super::document::Document;
+use super::term::{Term, TermMetaData};
+use super::utils::{inverse_document_frequency, tf_idf_score};
+use std::borrow::{Borrow, BorrowMut};
+use std::collections::HashMap;
+
+/// data container for the TF-IDF model
+#[derive(Debug, Default)]
+pub(crate) struct TfIdf {
+    /// collection of `Term`s and their associated metadata
+    terms: HashMap<Term, TermMetaData>,
+
+    /// number of documents processed by the model
+    num_documents: usize,
+}
+
+impl TfIdf {
+    /// create an empty TF-IDF model; must be populated with `add_document` prior to use
+    fn new() -> Self {
+        Self::default()
+    }
+
+    /// accessor method for the collection of `Term`s and `TermMetaData`
+    fn terms(&self) -> &HashMap<Term, TermMetaData> {
+        self.terms.borrow()
+    }
+
+    /// add a `Document` to the model
+    ///
+    /// The document itself is not stored; for each of its terms, the term's frequency within the
+    /// document is appended to the model's entry for that term. A term the model hasn't seen yet
+    /// is seeded with a clone of the document's metadata, so the `count` held by the model
+    /// reflects that first document only.
+    fn add_document(&mut self, document: Document) {
+        // increment number of docs seen, since we don't preserve the document itself; the count
+        // is what `calculate_tf_idf_scores` later uses as the corpus size
+        self.num_documents += 1;
+
+        for (term, doc_metadata) in document.terms().iter() {
+            // an incoming `Term` from a `Document` only has a valid `count` for that particular
+            // document; need to get the term frequency while both are known/valid
+            let term_frequency = document.term_frequency(term);
+
+            let metadata = self
+                .terms
+                .entry(term.clone())
+                .or_insert_with(|| doc_metadata.to_owned());
+
+            metadata.term_frequencies_mut().push(term_frequency);
+        }
+    }
+
+    /// (re)-calculate tf-idf scores for all terms, given the current number of documents
+    ///
+    /// # Notes
+    ///
+    /// old tf-idf scores are removed during calculations to keep old `Term`s at the same relative
+    /// level as new ones WRT corpus size
+    fn calculate_tf_idf_scores(&mut self) {
+        let num_documents = self.num_documents as f32;
+
+        for metadata in self.terms.borrow_mut().values_mut() {
+            // the idf only depends on the term, not on the individual frequency, so it's
+            // computed once per term instead of once per frequency
+            let idf =
+                inverse_document_frequency(num_documents, metadata.document_frequency() as f32);
+
+            // one score per recorded term frequency, in the same order
+            let scores = metadata
+                .term_frequencies()
+                .iter()
+                .map(|&frequency| tf_idf_score(frequency, idf))
+                .collect::<Vec<_>>();
+
+            // clear out old scores before storing the recalculated ones
+            let stored = metadata.tf_idf_scores_mut();
+            stored.clear();
+            stored.extend(scores);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// helper for this test suite; returns a copy of the tf-idf scores stored for `word` and
+    /// panics if the model doesn't know the word
+    fn get_scores(word: &str, model: &TfIdf) -> Vec<f32> {
+        model
+            .terms()
+            .get(&Term::new(word))
+            .unwrap()
+            .tf_idf_scores()
+            .into()
+    }
+
+    #[test]
+    /// given the example data at https://remykarem.github.io/tfidf-demo/, ensure the model
+    /// produces the same results
+    fn model_generates_expected_tf_idf_scores() {
+        let one = "Air quality in the sunny island improved gradually throughout Wednesday.";
+        let two =
+            "Air quality in Singapore on Wednesday continued to get worse as haze hit the island.";
+        let three = "The air quality in Singapore is monitored through a network of air monitoring stations located in different parts of the island";
+        let four = "The air quality in Singapore got worse on Wednesday.";
+
+        let docs = [one, two, three, four];
+        let mut model = TfIdf::new();
+
+        for doc in docs.iter() {
+            let d = Document::new(doc);
+            model.add_document(d);
+        }
+
+        assert_eq!(model.terms().len(), 19);
+
+        // scores are only populated once explicitly calculated
+        model.calculate_tf_idf_scores();
+
+        // terms present in all four documents have an idf of log10(4 / 4) == 0
+        assert_eq!(get_scores("quality", &model), [0.0, 0.0, 0.0, 0.0]);
+        assert_eq!(get_scores("air", &model), [0.0, 0.0, 0.0, 0.0]);
+        assert_eq!(
+            get_scores("wednesday", &model),
+            [0.017848395, 0.013882084, 0.024987752]
+        );
+        assert_eq!(
+            get_scores("island", &model),
+            [0.017848395, 0.013882084, 0.010411563]
+        );
+        assert_eq!(
+            get_scores("singapore", &model),
+            [0.013882084, 0.010411563, 0.024987752]
+        );
+        assert_eq!(get_scores("sunny", &model), [0.08600858]);
+        assert_eq!(get_scores("monitoring", &model), [0.05017167]);
+        assert_eq!(get_scores("stations", &model), [0.05017167]);
+        assert_eq!(get_scores("parts", &model), [0.05017167]);
+        assert_eq!(get_scores("haze", &model), [0.06689556]);
+        assert_eq!(get_scores("hit", &model), [0.06689556]);
+        assert_eq!(get_scores("worse", &model), [0.03344778, 0.060206003]);
+    }
+}
--- a/src/nlp/term.rs
+++ b/src/nlp/term.rs
@@ -0,0 +1,105 @@
+use std::borrow::BorrowMut;
+
+/// a single word, as tracked by the text processing pipeline
+#[derive(Debug, Hash, Eq, PartialEq, Default, Clone)]
+pub struct Term {
+    /// the string this term stands for
+    raw: String,
+}
+
+impl Term {
+    /// build a `Term` that owns a copy of `word`
+    pub fn new(word: &str) -> Self {
+        let raw = String::from(word);
+        Self { raw }
+    }
+
+    /// borrow the string this term stands for
+    pub fn raw(&self) -> &str {
+        self.raw.as_str()
+    }
+}
+
+/// bookkeeping attached to a `Term`
+#[derive(Debug, Clone, Default)]
+pub struct TermMetaData {
+    /// how many times the associated `Term` was seen in a single document
+    count: u32,
+
+    /// term frequencies recorded for the associated `Term`
+    term_frequencies: Vec<f32>,
+
+    /// tf-idf scores recorded for the associated `Term`
+    tf_idf_scores: Vec<f32>,
+}
+
+impl TermMetaData {
+    /// build an empty metadata container: a zero count and no frequencies or scores
+    pub fn new() -> Self {
+        Self {
+            count: 0,
+            term_frequencies: Vec::new(),
+            tf_idf_scores: Vec::new(),
+        }
+    }
+
+    /// number of term frequencies recorded so far, i.e. the length of the `term_frequencies`
+    /// collection
+    pub fn document_frequency(&self) -> usize {
+        self.term_frequencies.len()
+    }
+
+    /// mutable access to the recorded term frequencies
+    pub fn term_frequencies_mut(&mut self) -> &mut Vec<f32> {
+        self.term_frequencies.borrow_mut()
+    }
+
+    /// read-only view of the recorded term frequencies
+    pub fn term_frequencies(&self) -> &[f32] {
+        self.term_frequencies.as_slice()
+    }
+
+    /// mutable access to the number of times a `Term` was seen in a particular `Document`
+    pub fn count_mut(&mut self) -> &mut u32 {
+        &mut self.count
+    }
+
+    /// how many times a `Term` was seen in a particular `Document`
+    pub fn count(&self) -> u32 {
+        self.count
+    }
+
+    /// mutable access to the recorded tf-idf scores
+    pub fn tf_idf_scores_mut(&mut self) -> &mut Vec<f32> {
+        self.tf_idf_scores.borrow_mut()
+    }
+
+    /// read-only view of the recorded tf-idf scores
+    pub fn tf_idf_scores(&self) -> &[f32] {
+        self.tf_idf_scores.as_slice()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    /// `Term::raw` should hand back the word the `Term` was created from
+    fn nlp_term_accessor_test() {
+        let term = Term::new("stuff");
+        assert_eq!(term.raw(), "stuff");
+    }
+
+    #[test]
+    /// each `TermMetaData` mutable accessor should be reflected by its immutable counterpart
+    fn nlp_term_metadata_accessor_test() {
+        let mut metadata = TermMetaData::new();
+
+        *metadata.count_mut() += 1;
+        assert_eq!(metadata.count(), 1);
+
+        // document frequency is derived from the number of stored term frequencies
+        metadata.term_frequencies_mut().push(1.0);
+        assert_eq!(metadata.document_frequency(), 1);
+        assert_eq!(metadata.term_frequencies().first().unwrap(), &1.0);
+
+        metadata.tf_idf_scores_mut().push(1.0);
+        assert_eq!(metadata.tf_idf_scores().first().unwrap(), &1.0);
+    }
+}
--- a/src/nlp/utils.rs
+++ b/src/nlp/utils.rs
@@ -0,0 +1,158 @@
+use super::constants::{BOUNDED_WORD_REGEX, STOP_WORDS};
+use regex::Captures;
+use std::borrow::Cow;
+
+/// run the full pre-processing pipeline over `text`: strip punctuation, lowercase everything
+/// (utf-8 included), drop stop words, and finally split what's left on whitespace
+pub fn preprocess(text: &str) -> Vec<String> {
+    let stripped = remove_punctuation(text);
+    let lowered = normalize_case(stripped);
+    let filtered = remove_stop_words(&lowered);
+
+    filtered.split_whitespace().map(String::from).collect()
+}
+
+/// optimized version of `str::to_lowercase`
+///
+/// Input without any uppercase character is returned as-is, without allocating. Otherwise a new
+/// string is built in which every uppercase character is replaced by its complete lowercase
+/// mapping; a single character may lowercase to several (`'İ'` becomes `"i\u{307}"`).
+///
+/// Unlike `str::to_lowercase`, characters are mapped one at a time, so context-sensitive rules
+/// (such as the Greek word-final sigma) are not applied.
+pub fn normalize_case<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
+    let input = input.into();
+
+    let first_idx = match input.find(char::is_uppercase) {
+        Some(idx) => idx,
+        // nothing to lowercase; hand the input back untouched
+        None => return input,
+    };
+
+    // everything before the first uppercase char can be copied over verbatim
+    let mut output = String::with_capacity(input.len());
+    output.push_str(&input[..first_idx]);
+
+    for c in input[first_idx..].chars() {
+        if c.is_uppercase() {
+            // keep the whole mapping; taking only its first char silently drops the rest
+            output.extend(c.to_lowercase());
+        } else {
+            output.push(c);
+        }
+    }
+
+    Cow::Owned(output)
+}
+
+/// strip ascii and some utf-8 punctuation characters out of the given string
+pub fn remove_punctuation(text: &str) -> String {
+    let mut cleaned = String::with_capacity(text.len());
+
+    for ch in text.chars() {
+        match ch {
+            // non-separator type chars are dropped outright
+            '!' | '\\' | '"' | '#' | '$' | '%' | '&' | '(' | ')' | '*' | '+' | ':' | ';' | '<'
+            | '=' | '>' | '?' | '@' | '[' | ']' | '^' | '{' | '}' | '|' | '~' | ',' | '\''
+            | '“' | '”' | '’' | '‘' => {}
+            // separators become a space. This attempts to keep things like
+            // 'aboutblogfaqcontactpresstermslexicondisclosure' from happening
+            '/' | '–' | '—' | '.' => cleaned.push(' '),
+            // everything else passes through untouched
+            other => cleaned.push(other),
+        }
+    }
+
+    cleaned
+}
+
+/// blank out every word of the given string that appears in `STOP_WORDS`
+///
+/// only the matched word itself is removed; the whitespace around it is left in place
+pub fn remove_stop_words(text: &str) -> String {
+    let filtered = BOUNDED_WORD_REGEX.replace_all(text, |caps: &Captures| {
+        let candidate = &caps[0];
+
+        if STOP_WORDS.contains(&candidate) {
+            String::new()
+        } else {
+            candidate.to_owned()
+        }
+    });
+
+    filtered.into_owned()
+}
+
+/// inverse document frequency: the base-10 logarithm of `num_docs` over `doc_frequency`
+pub fn inverse_document_frequency(num_docs: f32, doc_frequency: f32) -> f32 {
+    let ratio = num_docs / doc_frequency;
+    ratio.log10()
+}
+
+/// calculate term frequency-inverse document frequency (tf-idf)
+///
+/// the score is simply the product of the given term frequency and inverse document frequency
+pub fn tf_idf_score(term_frequency: f32, idf: f32) -> f32 {
+    term_frequency * idf
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    /// ensure all expected punctuation characters are removed
+    fn test_remove_punctuation() {
+        let tester = "!\\\"#$%&()*+/:;<=>?@[]^{}|~,.'“”’‘–—\n‘’";
+        // the `"    \n"` is because of the things like / getting replaced with a space
+        assert_eq!(remove_punctuation(tester), "    \n");
+    }
+
+    #[test]
+    /// ensure uppercase characters are swapped to lowercase
+    fn test_normalize_case() {
+        let tester = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+        assert_eq!(normalize_case(tester), "abcdefghijklmnopqrstuvwxyz");
+    }
+
+    #[test]
+    /// ensure all stop words are removed from the list of stopwords ... incestuous
+    fn test_remove_stopwords() {
+        let all_words = STOP_WORDS
+            .iter()
+            .map(|&word| word.to_string())
+            .collect::<Vec<_>>()
+            .join(" ");
+
+        let removed = remove_stop_words(&all_words).replace(' ', "");
+
+        // the remaining chars are from the contraction-based stop words
+        assert_eq!(removed, "'d'll'm''s'ven'tn‘tn’t‘d‘ll‘m‘‘s‘ve’d’ll’m’’s’ve");
+    }
+
+    #[test]
+    /// ensure `preprocess` strips punctuation, lowercases, and drops stop words in one go
+    fn test_preprocess_results() {
+        let tester = "WHY are Y'all YELLing?";
+        assert_eq!(&preprocess(tester), &["yall", "yelling"]);
+    }
+
+    #[test]
+    /// ensure our calculations conform to the example provided at the link below
+    ///
+    /// https://www.kaggle.com/paulrohan2020/tf-idf-tutorial/notebook#TF-IDF-Model
+    ///
+    /// Consider a document containing 100 words wherein the word cat appears 3 times.
+    /// The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10
+    /// million documents and the word cat appears in one thousand of these. Then, the inverse
+    /// document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the
+    /// Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
+    fn idf_returns_expected_value() {
+        let num_docs = 10_000_000_f32;
+        let num_occurrences = 1_000_f32;
+        let abs_diff = (inverse_document_frequency(num_docs, num_occurrences) - 4.0).abs();
+
+        // floats are compared with a tolerance rather than for exact equality
+        assert!(abs_diff <= f32::EPSILON);
+    }
+
+    #[test]
+    /// ensure our calculations conform to the example provided at the link below
+    ///
+    /// https://www.kaggle.com/paulrohan2020/tf-idf-tutorial/notebook#TF-IDF-Model
+    ///
+    /// Consider a document containing 100 words wherein the word cat appears 3 times.
+    /// The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10
+    /// million documents and the word cat appears in one thousand of these. Then, the inverse
+    /// document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the
+    /// Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
+    fn tf_idf_returns_expected_value() {
+        let term_freq = 0.03_f32;
+        let num_docs = 10_000_000_f32;
+        let num_occurrences = 1_000_f32;
+        let idf = inverse_document_frequency(num_docs, num_occurrences);
+        let abs_diff = (tf_idf_score(term_freq, idf) - 0.12).abs();
+
+        // floats are compared with a tolerance rather than for exact equality
+        assert!(abs_diff <= f32::EPSILON);
+    }
+}