added nlp module

This commit is contained in:
epi
2022-02-27 13:42:07 -06:00
parent ca4d8f0c52
commit eed59e1da5
7 changed files with 949 additions and 0 deletions

View File

@@ -30,6 +30,7 @@ mod macros;
mod url;
mod response;
mod message;
mod nlp;
/// Alias for tokio::sync::mpsc::UnboundedSender<Command>
pub(crate) type CommandSender = UnboundedSender<Command>;

334
src/nlp/constants.rs Normal file
View File

@@ -0,0 +1,334 @@
use lazy_static::lazy_static;
use regex::Regex;
lazy_static! {
/// regular expression to match on words with numbers, underscores, and hyphens
pub(super) static ref BOUNDED_WORD_REGEX: Regex = Regex::new(r"\b[a-zA-Z0-9_-]+\b").unwrap();
}
/// collection of stop words from spaCy with small modifications
pub(super) static STOP_WORDS: [&str; 323] = [
"'d",
"'ll",
"'m",
"'re",
"'s",
"'ve",
"a",
"about",
"above",
"across",
"after",
"afterwards",
"again",
"against",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"am",
"among",
"amongst",
"amount",
"an",
"and",
"another",
"any",
"anyhow",
"anyone",
"anything",
"anyway",
"anywhere",
"are",
"around",
"as",
"at",
"back",
"be",
"became",
"because",
"become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
"behind",
"being",
"below",
"beside",
"besides",
"between",
"beyond",
"both",
"bottom",
"but",
"by",
"ca",
"call",
"can",
"cannot",
"could",
"did",
"do",
"does",
"doing",
"done",
"down",
"due",
"during",
"each",
"eight",
"either",
"eleven",
"else",
"elsewhere",
"empty",
"enough",
"even",
"ever",
"every",
"everyone",
"everything",
"everywhere",
"except",
"few",
"fifteen",
"fifty",
"first",
"five",
"for",
"former",
"formerly",
"forty",
"four",
"from",
"front",
"full",
"further",
"get",
"got",
"give",
"go",
"had",
"has",
"have",
"he",
"hence",
"her",
"here",
"hereafter",
"hereby",
"herein",
"hereupon",
"hers",
"herself",
"him",
"himself",
"his",
"how",
"however",
"hundred",
"i",
"if",
"in",
"indeed",
"into",
"is",
"it",
"its",
"itself",
"just",
"keep",
"last",
"latter",
"latterly",
"least",
"less",
"made",
"make",
"many",
"may",
"me",
"meanwhile",
"might",
"mine",
"more",
"moreover",
"most",
"mostly",
"move",
"much",
"must",
"my",
"myself",
"n't",
"name",
"namely",
"neither",
"never",
"nevertheless",
"next",
"nine",
"no",
"nobody",
"none",
"noone",
"nor",
"not",
"nothing",
"now",
"nowhere",
"n\u{2018}t",
"n\u{2019}t",
"of",
"off",
"often",
"on",
"once",
"one",
"only",
"onto",
"or",
"other",
"others",
"otherwise",
"our",
"ours",
"ourselves",
"out",
"over",
"own",
"part",
"per",
"perhaps",
"please",
"put",
"quite",
"rather",
"re",
"really",
"regarding",
"same",
"say",
"see",
"seem",
"seemed",
"seeming",
"seems",
"serious",
"several",
"she",
"should",
"side",
"since",
"six",
"sixty",
"so",
"some",
"somehow",
"someone",
"something",
"sometime",
"sometimes",
"somewhere",
"still",
"such",
"take",
"ten",
"than",
"that",
"the",
"their",
"them",
"themselves",
"then",
"thence",
"there",
"thereafter",
"thereby",
"therefore",
"therein",
"thereupon",
"these",
"they",
"third",
"this",
"those",
"though",
"three",
"through",
"throughout",
"thru",
"thus",
"to",
"together",
"too",
"toward",
"towards",
"twelve",
"twenty",
"two",
"under",
"unless",
"until",
"up",
"upon",
"used",
"using",
"various",
"very",
"via",
"was",
"we",
"well",
"were",
"what",
"whatever",
"when",
"whence",
"whenever",
"where",
"whereafter",
"whereas",
"whereby",
"wherein",
"whereupon",
"wherever",
"whether",
"which",
"while",
"whither",
"who",
"whoever",
"whole",
"whom",
"whose",
"why",
"will",
"with",
"within",
"without",
"would",
"yet",
"you",
"your",
"yours",
"yourself",
"yourselves",
"\u{2018}d",
"\u{2018}ll",
"\u{2018}m",
"\u{2018}re",
"\u{2018}s",
"\u{2018}ve",
"\u{2019}d",
"\u{2019}ll",
"\u{2019}m",
"\u{2019}re",
"\u{2019}s",
"\u{2019}ve",
];

200
src/nlp/document.rs Normal file
View File

@@ -0,0 +1,200 @@
use super::term::{Term, TermMetaData};
use super::utils::preprocess;
use scraper::{Html, Node, Selector};
use std::collections::HashMap;
/// data container representing a single document, in the nlp sense
#[derive(Debug, Default)]
pub struct Document {
/// collection of `Term`s and their associated metadata
pub terms: HashMap<Term, TermMetaData>,
/// number of terms contained within the document
number_of_terms: usize,
}
impl Document {
/// create a new `Document` from the given string
pub fn new(text: &str) -> Self {
let mut document = Self::default();
let processed = preprocess(text);
document.number_of_terms += processed.len();
for normalized in processed {
if normalized.len() > 2 {
document.add_term(&normalized)
}
}
document
}
/// add a `Term` to the document if it's not already tracked, otherwise increment the number
/// of times the term has been seen
pub fn add_term(&mut self, word: &str) {
let term = Term::new(word);
let metadata = self.terms.entry(term).or_insert_with(TermMetaData::new);
*metadata.count_mut() += 1;
}
/// create a new `Document` from the given HTML string
pub fn from_html(raw_html: &str) -> Self {
let selector = Selector::parse("body").unwrap();
let html = Html::parse_document(raw_html);
let text = html
.select(&selector)
.next()
.unwrap()
.descendants()
.filter_map(|node| {
if !node.value().is_text() && !node.value().is_comment() {
return None;
}
// have a Text||Comment node, trim whitespace to test for all whitespace stuff
let trimmed = if node.value().is_text() {
node.value().as_text().unwrap().text.trim()
} else {
node.value().as_comment().unwrap().comment.trim()
};
if trimmed.is_empty() {
return None;
}
// found a non-empty Text||Comment node, need to check its parent to determine if
// it's a <script>||<style> tag. We're assuming text within a script||style tag is
// uninteresting
let parent = node.parent().unwrap().value();
if !parent.is_element() {
return None;
}
// parent is an Element node, see if it's a <script> or <style>
if let Node::Element(element) = parent {
if element.name() == "script" || element.name() == "style" {
return None;
}
// at this point, we have a non-empty Text element with a non-script|style parent;
// now we can return the trimmed up string
return Some(format!("{} ", trimmed));
}
// not an Element node
None
})
.collect::<String>();
// call `new` to push the parsed html through the pre-processing pipeline and process all
// the words
Self::new(&text)
}
/// Log normalized weighting scheme for term frequency
pub fn term_frequency(&self, term: &Term) -> f32 {
if let Some(metadata) = self.terms.get(term) {
metadata.count() as f32 / self.number_of_terms() as f32
} else {
0.0
}
}
/// immutable reference to the collection of terms and their metadata
pub(super) fn terms(&self) -> &HashMap<Term, TermMetaData> {
&self.terms
}
/// number of terms the current document knows about
pub(super) fn number_of_terms(&self) -> usize {
self.number_of_terms
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
/// `Document::new` should preprocess text and generate a hashmap of `Term, TermMetadata`
fn nlp_document_creation_from_text() {
let doc = Document::new("The air quality in Singapore got worse on Wednesday.");
let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
for expected in expected_terms {
let term = Term::new(expected);
assert!(doc.terms().contains_key(&term));
assert_eq!(doc.number_of_terms, 5);
assert_eq!(doc.terms().get(&term).unwrap().count(), 1);
// since term frequencies aren't calculated on `new`, document frequency is zero in
// addition to the empty term_frequencies slice
let empty: &[f32] = &[];
assert_eq!(doc.terms().get(&term).unwrap().term_frequencies(), empty);
assert_eq!(doc.terms().get(&term).unwrap().document_frequency(), 0);
}
}
#[test]
/// `Document::new` should preprocess html and generate a hashmap of `Term, TermMetadata`
fn nlp_document_creation_from_html() {
let empty = Document::from_html("<html></html>");
assert_eq!(empty.number_of_terms, 0);
let doc = Document::from_html(
"<html><body><p>The air quality in Singapore got worse on Wednesday.</p></body></html>",
);
let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
for expected in expected_terms {
let term = Term::new(expected);
assert_eq!(doc.number_of_terms, 5);
assert!(doc.terms().contains_key(&term));
assert_eq!(doc.terms().get(&term).unwrap().count(), 1);
// since term frequencies aren't calculated on `new`, document frequency is zero in
// addition to the empty term_frequencies slice
let empty: &[f32] = &[];
assert_eq!(doc.terms().get(&term).unwrap().term_frequencies(), empty);
assert_eq!(doc.terms().get(&term).unwrap().document_frequency(), 0);
}
}
#[test]
/// simple check of the `term_frequency` function's return value
fn term_frequency_validation() {
let doc = Document::new("The air quality in Singapore got worse on Wednesday. Air Jordan.");
let air_freq = doc.term_frequency(&Term::new("air"));
let abs_diff = (air_freq - 0.2857143).abs();
assert!(abs_diff <= f32::EPSILON);
let non_existent = doc.term_frequency(&Term::new("derpatronic"));
assert_eq!(non_existent, 0.0);
}
#[test]
/// test accessors for correctness
fn document_accessor_test() {
let doc = Document::new("The air quality in Singapore got worse on Wednesday.");
let keys = doc.terms().keys().map(|key| key.raw()).collect::<Vec<_>>();
let expected = ["air", "quality", "singapore", "worse", "wednesday"];
assert_eq!(doc.number_of_terms(), 5);
for key in keys {
assert!(expected.contains(&key));
}
}
}

11
src/nlp/mod.rs Normal file
View File

@@ -0,0 +1,11 @@
//! small stand-alone tf-idf library, specifically designed for use in feroxbuster
mod constants;
mod document;
mod model;
mod term;
mod utils;
pub(crate) use self::document::Document;
pub(crate) use self::model::TfIdf;
pub(crate) use self::term::Term;

140
src/nlp/model.rs Normal file
View File

@@ -0,0 +1,140 @@
use super::document::Document;
use super::term::{Term, TermMetaData};
use super::utils::{inverse_document_frequency, tf_idf_score};
use std::borrow::{Borrow, BorrowMut};
use std::collections::HashMap;
/// data container for the TF-IDF model
#[derive(Debug, Default)]
pub(crate) struct TfIdf {
/// collection of `Term`s and their associated metadata
terms: HashMap<Term, TermMetaData>,
/// number of documents processed by the model
num_documents: usize,
}
impl TfIdf {
/// create an empty TF-IDF model; must be populated with `add_document` prior to use
fn new() -> Self {
Self {
terms: HashMap::new(),
num_documents: 0,
}
}
/// accessor method for the collection of `Term`s and `TermMetaData`
fn terms(&self) -> &HashMap<Term, TermMetaData> {
self.terms.borrow()
}
/// add a `Document` to the model
fn add_document(&mut self, document: Document) {
// increment number of docs seen, since we don't preserve the document itself; this needs
// to happen before calls to `self.inverse_document_frequency`, as it relies on the count
// being up to date
self.num_documents += 1;
for (term, doc_metadata) in document.terms().iter() {
// an incoming `Term` from a `Document` only has a valid `count` for that particular
// document; need to get the term frequency while both are known/valid
let term_frequency = document.term_frequency(term);
let metadata = self
.terms
.entry(term.clone())
.or_insert_with(|| doc_metadata.to_owned());
metadata.term_frequencies_mut().push(term_frequency);
}
}
/// (re)-calculate tf-idf scores for all terms, given the current number of documents
///
/// # Notes
///
/// old tf-idf scores are removed during calculations to keep new `Term`s at the same relative
/// level as new ones WRT corpus size
fn calculate_tf_idf_scores(&mut self) {
for metadata in self.terms.borrow_mut().values_mut() {
let num_frequencies = metadata.term_frequencies().len();
// clear out old scores before recalculating
metadata.tf_idf_scores_mut().clear();
let mut to_add = Vec::with_capacity(num_frequencies);
for frequency in metadata.term_frequencies() {
let idf = inverse_document_frequency(
self.num_documents as f32,
metadata.document_frequency() as f32,
);
let score = tf_idf_score(*frequency, idf);
to_add.push(score);
}
metadata.tf_idf_scores_mut().append(&mut to_add);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
/// helper for this test suite
fn get_scores(word: &str, model: &TfIdf) -> Vec<f32> {
model
.terms()
.get(&Term::new(word))
.unwrap()
.tf_idf_scores()
.into()
}
#[test]
/// given the example data at https://remykarem.github.io/tfidf-demo/, ensure the model
/// produces the same results
fn model_generates_expected_tf_idf_scores() {
let one = "Air quality in the sunny island improved gradually throughout Wednesday.";
let two =
"Air quality in Singapore on Wednesday continued to get worse as haze hit the island.";
let three = "The air quality in Singapore is monitored through a network of air monitoring stations located in different parts of the island";
let four = "The air quality in Singapore got worse on Wednesday.";
let docs = [one, two, three, four];
let mut model = TfIdf::new();
for doc in docs.iter() {
let d = Document::new(doc);
model.add_document(d);
}
assert_eq!(model.terms().len(), 19);
model.calculate_tf_idf_scores();
assert_eq!(get_scores("quality", &model), [0.0, 0.0, 0.0, 0.0]);
assert_eq!(get_scores("air", &model), [0.0, 0.0, 0.0, 0.0]);
assert_eq!(
get_scores("wednesday", &model),
[0.017848395, 0.013882084, 0.024987752]
);
assert_eq!(
get_scores("island", &model),
[0.017848395, 0.013882084, 0.010411563]
);
assert_eq!(
get_scores("singapore", &model),
[0.013882084, 0.010411563, 0.024987752]
);
assert_eq!(get_scores("sunny", &model), [0.08600858]);
assert_eq!(get_scores("monitoring", &model), [0.05017167]);
assert_eq!(get_scores("stations", &model), [0.05017167]);
assert_eq!(get_scores("parts", &model), [0.05017167]);
assert_eq!(get_scores("haze", &model), [0.06689556]);
assert_eq!(get_scores("hit", &model), [0.06689556]);
assert_eq!(get_scores("worse", &model), [0.03344778, 0.060206003]);
}
}

105
src/nlp/term.rs Normal file
View File

@@ -0,0 +1,105 @@
use std::borrow::BorrowMut;
/// single word term for text processing
#[derive(Debug, Hash, Eq, PartialEq, Default, Clone)]
pub struct Term {
/// underlying string that the term represents
raw: String,
}
impl Term {
/// given a word, create a new `Term`
pub fn new(word: &str) -> Self {
Self {
raw: word.to_owned(),
}
}
/// return a reference to the underlying string
pub fn raw(&self) -> &str {
&self.raw
}
}
/// metadata to be associated with a `Term`
#[derive(Debug, Clone, Default)]
pub struct TermMetaData {
/// number of times the associated `Term` was seen in a single document
count: u32,
/// collection of term frequencies for the associated `Term`
term_frequencies: Vec<f32>,
/// collection of tf-idf scores for the associated `Term`
tf_idf_scores: Vec<f32>,
}
impl TermMetaData {
/// create a new metadata container
pub fn new() -> Self {
Self::default()
}
/// number of times a `Term` has appeared in any `Document` within the corpus
pub fn document_frequency(&self) -> usize {
self.term_frequencies().len()
}
/// mutable reference to the collection of term frequencies
pub fn term_frequencies_mut(&mut self) -> &mut Vec<f32> {
self.term_frequencies.borrow_mut()
}
/// immutable reference to the collection of term frequencies
pub fn term_frequencies(&self) -> &[f32] {
&self.term_frequencies
}
/// mutable reference to the number of times a `Term` was seen in a particular `Document`
pub fn count_mut(&mut self) -> &mut u32 {
self.count.borrow_mut()
}
/// number of times a `Term` was seen in a particular `Document`
pub fn count(&self) -> u32 {
self.count
}
/// mutable reference to the collection of tf-idf scores
pub fn tf_idf_scores_mut(&mut self) -> &mut Vec<f32> {
self.tf_idf_scores.borrow_mut()
}
/// immutable reference to the collection of tf-idf scores
pub fn tf_idf_scores(&self) -> &[f32] {
&self.tf_idf_scores
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
/// test accessors for correctness
fn nlp_term_accessor_test() {
let term = Term::new("stuff");
assert_eq!(term.raw(), "stuff");
}
#[test]
/// test accessors for correctness
fn nlp_term_metadata_accessor_test() {
let mut metadata = TermMetaData::new();
*metadata.count_mut() += 1;
assert_eq!(metadata.count(), 1);
metadata.term_frequencies_mut().push(1.0);
assert_eq!(metadata.document_frequency(), 1);
assert_eq!(metadata.term_frequencies().first().unwrap(), &1.0);
metadata.tf_idf_scores_mut().push(1.0);
assert_eq!(metadata.tf_idf_scores().first().unwrap(), &1.0);
}
}

158
src/nlp/utils.rs Normal file
View File

@@ -0,0 +1,158 @@
use super::constants::{BOUNDED_WORD_REGEX, STOP_WORDS};
use regex::Captures;
use std::borrow::Cow;
/// pre-processing pipeline wrapper that removes punctuation, normalizes word case (utf-8 included)
/// to lowercase, and remove stop words
pub fn preprocess(text: &str) -> Vec<String> {
let text = remove_punctuation(text);
let text = normalize_case(text);
let text = remove_stop_words(&text);
text.split_whitespace()
.map(|word| word.to_string())
.collect::<Vec<_>>()
}
/// optimized version of `str::to_lowercase`
pub fn normalize_case<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
let input = input.into();
let first = input.find(char::is_uppercase);
if let Some(first_idx) = first {
let mut output = String::from(&input[..first_idx]);
output.reserve(input.len() - first_idx);
for c in input[first_idx..].chars() {
if c.is_uppercase() {
output.push(c.to_lowercase().next().unwrap())
} else {
output.push(c)
}
}
Cow::Owned(output)
} else {
input
}
}
/// remove ascii and some utf-8 punctuation characters from the given string
pub fn remove_punctuation(text: &str) -> String {
// non-separator type chars can be replaced with an empty string, while separators are replaced
// with a space. This attempts to keep things like
// 'aboutblogfaqcontactpresstermslexicondisclosure' from happening
text.replace(
[
'!', '\\', '"', '#', '$', '%', '&', '(', ')', '*', '+', ':', ';', '<', '=', '>', '?',
'@', '[', ']', '^', '{', '}', '|', '~', ',', '\'', '“', '”', '', '', '', '',
],
"",
)
.replace(['/', '', '—', '.'], " ")
}
/// remove stop words from the given string
pub fn remove_stop_words(text: &str) -> String {
BOUNDED_WORD_REGEX
.replace_all(text, |caps: &Captures| {
let word = &caps[0];
if !STOP_WORDS.contains(&word) {
word.to_owned()
} else {
String::new()
}
})
.into()
}
/// calculate inverse document frequency
pub fn inverse_document_frequency(num_docs: f32, doc_frequency: f32) -> f32 {
f32::log10(num_docs / doc_frequency)
}
/// calculate term frequency-inverse document frequency (tf-idf)
pub fn tf_idf_score(term_frequency: f32, idf: f32) -> f32 {
term_frequency * idf
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
/// ensure all expected punctuation characters are removed
fn test_remove_punctuation() {
let tester = "!\\\"#$%&()*+/:;<=>?@[]^{}|~,.'“”’‘–—\n";
// the `" \n"` is because of the things like / getting replaced with a space
assert_eq!(remove_punctuation(tester), " \n");
}
#[test]
/// ensure uppercase characters are swapped to lowercase
fn test_normalize_case() {
let tester = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
assert_eq!(normalize_case(tester), "abcdefghijklmnopqrstuvwxyz");
}
#[test]
/// ensure all stop words are removed from the list of stopwords ... intestuous
fn test_remove_stopwords() {
let all_words = STOP_WORDS
.iter()
.map(|&word| word.to_string())
.collect::<Vec<_>>()
.join(" ");
let removed = remove_stop_words(&all_words).replace(' ', "");
// the remaining chars are from the contraction-based stop words
assert_eq!(removed, "'d'll'm''s'ven'tntntdllmsvedllmsve");
}
#[test]
/// ensure preprocess
fn test_preprocess_results() {
let tester = "WHY are Y'all YELLing?";
assert_eq!(&preprocess(tester), &["yall", "yelling"]);
}
#[test]
/// ensure our calculations conform to the example provided at the link below
///
/// https://www.kaggle.com/paulrohan2020/tf-idf-tutorial/notebook#TF-IDF-Model
///
/// Consider a document containing 100 words wherein the word cat appears 3 times.
/// The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10
/// million documents and the word cat appears in one thousand of these. Then, the inverse
/// document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the
/// Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
fn idf_returns_expected_value() {
let num_docs = 10_000_000_f32;
let num_occurrences = 1_000_f32;
let abs_diff = (inverse_document_frequency(num_docs, num_occurrences) - 4.0).abs();
assert!(abs_diff <= f32::EPSILON);
}
#[test]
/// ensure our calculations conform to the example provided at the link below
///
/// https://www.kaggle.com/paulrohan2020/tf-idf-tutorial/notebook#TF-IDF-Model
///
/// Consider a document containing 100 words wherein the word cat appears 3 times.
/// The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10
/// million documents and the word cat appears in one thousand of these. Then, the inverse
/// document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the
/// Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
fn tf_idf_returns_expected_value() {
let term_freq = 0.03_f32;
let num_docs = 10_000_000_f32;
let num_occurrences = 1_000_f32;
let idf = inverse_document_frequency(num_docs, num_occurrences);
let abs_diff = (tf_idf_score(term_freq, idf) - 0.12).abs();
assert!(abs_diff <= f32::EPSILON);
}
}