mirror of
https://github.com/epi052/feroxbuster.git
synced 2026-06-07 10:01:12 -03:00
added nlp module
This commit is contained in:
@@ -30,6 +30,7 @@ mod macros;
|
||||
mod url;
|
||||
mod response;
|
||||
mod message;
|
||||
mod nlp;
|
||||
|
||||
/// Alias for tokio::sync::mpsc::UnboundedSender<Command>
|
||||
pub(crate) type CommandSender = UnboundedSender<Command>;
|
||||
|
||||
334
src/nlp/constants.rs
Normal file
334
src/nlp/constants.rs
Normal file
@@ -0,0 +1,334 @@
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
lazy_static! {
|
||||
/// regular expression to match on words with numbers, underscores, and hyphens
|
||||
pub(super) static ref BOUNDED_WORD_REGEX: Regex = Regex::new(r"\b[a-zA-Z0-9_-]+\b").unwrap();
|
||||
}
|
||||
|
||||
/// collection of stop words from spaCy with small modifications
|
||||
pub(super) static STOP_WORDS: [&str; 323] = [
|
||||
"'d",
|
||||
"'ll",
|
||||
"'m",
|
||||
"'re",
|
||||
"'s",
|
||||
"'ve",
|
||||
"a",
|
||||
"about",
|
||||
"above",
|
||||
"across",
|
||||
"after",
|
||||
"afterwards",
|
||||
"again",
|
||||
"against",
|
||||
"almost",
|
||||
"alone",
|
||||
"along",
|
||||
"already",
|
||||
"also",
|
||||
"although",
|
||||
"always",
|
||||
"am",
|
||||
"among",
|
||||
"amongst",
|
||||
"amount",
|
||||
"an",
|
||||
"and",
|
||||
"another",
|
||||
"any",
|
||||
"anyhow",
|
||||
"anyone",
|
||||
"anything",
|
||||
"anyway",
|
||||
"anywhere",
|
||||
"are",
|
||||
"around",
|
||||
"as",
|
||||
"at",
|
||||
"back",
|
||||
"be",
|
||||
"became",
|
||||
"because",
|
||||
"become",
|
||||
"becomes",
|
||||
"becoming",
|
||||
"been",
|
||||
"before",
|
||||
"beforehand",
|
||||
"behind",
|
||||
"being",
|
||||
"below",
|
||||
"beside",
|
||||
"besides",
|
||||
"between",
|
||||
"beyond",
|
||||
"both",
|
||||
"bottom",
|
||||
"but",
|
||||
"by",
|
||||
"ca",
|
||||
"call",
|
||||
"can",
|
||||
"cannot",
|
||||
"could",
|
||||
"did",
|
||||
"do",
|
||||
"does",
|
||||
"doing",
|
||||
"done",
|
||||
"down",
|
||||
"due",
|
||||
"during",
|
||||
"each",
|
||||
"eight",
|
||||
"either",
|
||||
"eleven",
|
||||
"else",
|
||||
"elsewhere",
|
||||
"empty",
|
||||
"enough",
|
||||
"even",
|
||||
"ever",
|
||||
"every",
|
||||
"everyone",
|
||||
"everything",
|
||||
"everywhere",
|
||||
"except",
|
||||
"few",
|
||||
"fifteen",
|
||||
"fifty",
|
||||
"first",
|
||||
"five",
|
||||
"for",
|
||||
"former",
|
||||
"formerly",
|
||||
"forty",
|
||||
"four",
|
||||
"from",
|
||||
"front",
|
||||
"full",
|
||||
"further",
|
||||
"get",
|
||||
"got",
|
||||
"give",
|
||||
"go",
|
||||
"had",
|
||||
"has",
|
||||
"have",
|
||||
"he",
|
||||
"hence",
|
||||
"her",
|
||||
"here",
|
||||
"hereafter",
|
||||
"hereby",
|
||||
"herein",
|
||||
"hereupon",
|
||||
"hers",
|
||||
"herself",
|
||||
"him",
|
||||
"himself",
|
||||
"his",
|
||||
"how",
|
||||
"however",
|
||||
"hundred",
|
||||
"i",
|
||||
"if",
|
||||
"in",
|
||||
"indeed",
|
||||
"into",
|
||||
"is",
|
||||
"it",
|
||||
"its",
|
||||
"itself",
|
||||
"just",
|
||||
"keep",
|
||||
"last",
|
||||
"latter",
|
||||
"latterly",
|
||||
"least",
|
||||
"less",
|
||||
"made",
|
||||
"make",
|
||||
"many",
|
||||
"may",
|
||||
"me",
|
||||
"meanwhile",
|
||||
"might",
|
||||
"mine",
|
||||
"more",
|
||||
"moreover",
|
||||
"most",
|
||||
"mostly",
|
||||
"move",
|
||||
"much",
|
||||
"must",
|
||||
"my",
|
||||
"myself",
|
||||
"n't",
|
||||
"name",
|
||||
"namely",
|
||||
"neither",
|
||||
"never",
|
||||
"nevertheless",
|
||||
"next",
|
||||
"nine",
|
||||
"no",
|
||||
"nobody",
|
||||
"none",
|
||||
"noone",
|
||||
"nor",
|
||||
"not",
|
||||
"nothing",
|
||||
"now",
|
||||
"nowhere",
|
||||
"n\u{2018}t",
|
||||
"n\u{2019}t",
|
||||
"of",
|
||||
"off",
|
||||
"often",
|
||||
"on",
|
||||
"once",
|
||||
"one",
|
||||
"only",
|
||||
"onto",
|
||||
"or",
|
||||
"other",
|
||||
"others",
|
||||
"otherwise",
|
||||
"our",
|
||||
"ours",
|
||||
"ourselves",
|
||||
"out",
|
||||
"over",
|
||||
"own",
|
||||
"part",
|
||||
"per",
|
||||
"perhaps",
|
||||
"please",
|
||||
"put",
|
||||
"quite",
|
||||
"rather",
|
||||
"re",
|
||||
"really",
|
||||
"regarding",
|
||||
"same",
|
||||
"say",
|
||||
"see",
|
||||
"seem",
|
||||
"seemed",
|
||||
"seeming",
|
||||
"seems",
|
||||
"serious",
|
||||
"several",
|
||||
"she",
|
||||
"should",
|
||||
"side",
|
||||
"since",
|
||||
"six",
|
||||
"sixty",
|
||||
"so",
|
||||
"some",
|
||||
"somehow",
|
||||
"someone",
|
||||
"something",
|
||||
"sometime",
|
||||
"sometimes",
|
||||
"somewhere",
|
||||
"still",
|
||||
"such",
|
||||
"take",
|
||||
"ten",
|
||||
"than",
|
||||
"that",
|
||||
"the",
|
||||
"their",
|
||||
"them",
|
||||
"themselves",
|
||||
"then",
|
||||
"thence",
|
||||
"there",
|
||||
"thereafter",
|
||||
"thereby",
|
||||
"therefore",
|
||||
"therein",
|
||||
"thereupon",
|
||||
"these",
|
||||
"they",
|
||||
"third",
|
||||
"this",
|
||||
"those",
|
||||
"though",
|
||||
"three",
|
||||
"through",
|
||||
"throughout",
|
||||
"thru",
|
||||
"thus",
|
||||
"to",
|
||||
"together",
|
||||
"too",
|
||||
"toward",
|
||||
"towards",
|
||||
"twelve",
|
||||
"twenty",
|
||||
"two",
|
||||
"under",
|
||||
"unless",
|
||||
"until",
|
||||
"up",
|
||||
"upon",
|
||||
"used",
|
||||
"using",
|
||||
"various",
|
||||
"very",
|
||||
"via",
|
||||
"was",
|
||||
"we",
|
||||
"well",
|
||||
"were",
|
||||
"what",
|
||||
"whatever",
|
||||
"when",
|
||||
"whence",
|
||||
"whenever",
|
||||
"where",
|
||||
"whereafter",
|
||||
"whereas",
|
||||
"whereby",
|
||||
"wherein",
|
||||
"whereupon",
|
||||
"wherever",
|
||||
"whether",
|
||||
"which",
|
||||
"while",
|
||||
"whither",
|
||||
"who",
|
||||
"whoever",
|
||||
"whole",
|
||||
"whom",
|
||||
"whose",
|
||||
"why",
|
||||
"will",
|
||||
"with",
|
||||
"within",
|
||||
"without",
|
||||
"would",
|
||||
"yet",
|
||||
"you",
|
||||
"your",
|
||||
"yours",
|
||||
"yourself",
|
||||
"yourselves",
|
||||
"\u{2018}d",
|
||||
"\u{2018}ll",
|
||||
"\u{2018}m",
|
||||
"\u{2018}re",
|
||||
"\u{2018}s",
|
||||
"\u{2018}ve",
|
||||
"\u{2019}d",
|
||||
"\u{2019}ll",
|
||||
"\u{2019}m",
|
||||
"\u{2019}re",
|
||||
"\u{2019}s",
|
||||
"\u{2019}ve",
|
||||
];
|
||||
200
src/nlp/document.rs
Normal file
200
src/nlp/document.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
use super::term::{Term, TermMetaData};
|
||||
use super::utils::preprocess;
|
||||
use scraper::{Html, Node, Selector};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// data container representing a single document, in the nlp sense
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Document {
|
||||
/// collection of `Term`s and their associated metadata
|
||||
pub terms: HashMap<Term, TermMetaData>,
|
||||
|
||||
/// number of terms contained within the document
|
||||
number_of_terms: usize,
|
||||
}
|
||||
|
||||
impl Document {
|
||||
/// create a new `Document` from the given string
|
||||
pub fn new(text: &str) -> Self {
|
||||
let mut document = Self::default();
|
||||
|
||||
let processed = preprocess(text);
|
||||
|
||||
document.number_of_terms += processed.len();
|
||||
|
||||
for normalized in processed {
|
||||
if normalized.len() > 2 {
|
||||
document.add_term(&normalized)
|
||||
}
|
||||
}
|
||||
document
|
||||
}
|
||||
|
||||
/// add a `Term` to the document if it's not already tracked, otherwise increment the number
|
||||
/// of times the term has been seen
|
||||
pub fn add_term(&mut self, word: &str) {
|
||||
let term = Term::new(word);
|
||||
|
||||
let metadata = self.terms.entry(term).or_insert_with(TermMetaData::new);
|
||||
*metadata.count_mut() += 1;
|
||||
}
|
||||
|
||||
/// create a new `Document` from the given HTML string
|
||||
pub fn from_html(raw_html: &str) -> Self {
|
||||
let selector = Selector::parse("body").unwrap();
|
||||
|
||||
let html = Html::parse_document(raw_html);
|
||||
|
||||
let text = html
|
||||
.select(&selector)
|
||||
.next()
|
||||
.unwrap()
|
||||
.descendants()
|
||||
.filter_map(|node| {
|
||||
if !node.value().is_text() && !node.value().is_comment() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// have a Text||Comment node, trim whitespace to test for all whitespace stuff
|
||||
let trimmed = if node.value().is_text() {
|
||||
node.value().as_text().unwrap().text.trim()
|
||||
} else {
|
||||
node.value().as_comment().unwrap().comment.trim()
|
||||
};
|
||||
|
||||
if trimmed.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// found a non-empty Text||Comment node, need to check its parent to determine if
|
||||
// it's a <script>||<style> tag. We're assuming text within a script||style tag is
|
||||
// uninteresting
|
||||
|
||||
let parent = node.parent().unwrap().value();
|
||||
|
||||
if !parent.is_element() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// parent is an Element node, see if it's a <script> or <style>
|
||||
|
||||
if let Node::Element(element) = parent {
|
||||
if element.name() == "script" || element.name() == "style" {
|
||||
return None;
|
||||
}
|
||||
|
||||
// at this point, we have a non-empty Text element with a non-script|style parent;
|
||||
// now we can return the trimmed up string
|
||||
return Some(format!("{} ", trimmed));
|
||||
}
|
||||
|
||||
// not an Element node
|
||||
None
|
||||
})
|
||||
.collect::<String>();
|
||||
|
||||
// call `new` to push the parsed html through the pre-processing pipeline and process all
|
||||
// the words
|
||||
Self::new(&text)
|
||||
}
|
||||
|
||||
/// Log normalized weighting scheme for term frequency
|
||||
pub fn term_frequency(&self, term: &Term) -> f32 {
|
||||
if let Some(metadata) = self.terms.get(term) {
|
||||
metadata.count() as f32 / self.number_of_terms() as f32
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
|
||||
/// immutable reference to the collection of terms and their metadata
|
||||
pub(super) fn terms(&self) -> &HashMap<Term, TermMetaData> {
|
||||
&self.terms
|
||||
}
|
||||
|
||||
/// number of terms the current document knows about
|
||||
pub(super) fn number_of_terms(&self) -> usize {
|
||||
self.number_of_terms
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
/// `Document::new` should preprocess text and generate a hashmap of `Term, TermMetadata`
|
||||
fn nlp_document_creation_from_text() {
|
||||
let doc = Document::new("The air quality in Singapore got worse on Wednesday.");
|
||||
|
||||
let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
|
||||
|
||||
for expected in expected_terms {
|
||||
let term = Term::new(expected);
|
||||
assert!(doc.terms().contains_key(&term));
|
||||
assert_eq!(doc.number_of_terms, 5);
|
||||
assert_eq!(doc.terms().get(&term).unwrap().count(), 1);
|
||||
|
||||
// since term frequencies aren't calculated on `new`, document frequency is zero in
|
||||
// addition to the empty term_frequencies slice
|
||||
let empty: &[f32] = &[];
|
||||
assert_eq!(doc.terms().get(&term).unwrap().term_frequencies(), empty);
|
||||
assert_eq!(doc.terms().get(&term).unwrap().document_frequency(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// `Document::new` should preprocess html and generate a hashmap of `Term, TermMetadata`
|
||||
fn nlp_document_creation_from_html() {
|
||||
let empty = Document::from_html("<html></html>");
|
||||
assert_eq!(empty.number_of_terms, 0);
|
||||
|
||||
let doc = Document::from_html(
|
||||
"<html><body><p>The air quality in Singapore got worse on Wednesday.</p></body></html>",
|
||||
);
|
||||
|
||||
let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
|
||||
|
||||
for expected in expected_terms {
|
||||
let term = Term::new(expected);
|
||||
assert_eq!(doc.number_of_terms, 5);
|
||||
assert!(doc.terms().contains_key(&term));
|
||||
assert_eq!(doc.terms().get(&term).unwrap().count(), 1);
|
||||
|
||||
// since term frequencies aren't calculated on `new`, document frequency is zero in
|
||||
// addition to the empty term_frequencies slice
|
||||
let empty: &[f32] = &[];
|
||||
assert_eq!(doc.terms().get(&term).unwrap().term_frequencies(), empty);
|
||||
assert_eq!(doc.terms().get(&term).unwrap().document_frequency(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// simple check of the `term_frequency` function's return value
|
||||
fn term_frequency_validation() {
|
||||
let doc = Document::new("The air quality in Singapore got worse on Wednesday. Air Jordan.");
|
||||
|
||||
let air_freq = doc.term_frequency(&Term::new("air"));
|
||||
|
||||
let abs_diff = (air_freq - 0.2857143).abs();
|
||||
assert!(abs_diff <= f32::EPSILON);
|
||||
|
||||
let non_existent = doc.term_frequency(&Term::new("derpatronic"));
|
||||
assert_eq!(non_existent, 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// test accessors for correctness
|
||||
fn document_accessor_test() {
|
||||
let doc = Document::new("The air quality in Singapore got worse on Wednesday.");
|
||||
let keys = doc.terms().keys().map(|key| key.raw()).collect::<Vec<_>>();
|
||||
|
||||
let expected = ["air", "quality", "singapore", "worse", "wednesday"];
|
||||
|
||||
assert_eq!(doc.number_of_terms(), 5);
|
||||
|
||||
for key in keys {
|
||||
assert!(expected.contains(&key));
|
||||
}
|
||||
}
|
||||
}
|
||||
11
src/nlp/mod.rs
Normal file
11
src/nlp/mod.rs
Normal file
@@ -0,0 +1,11 @@
|
||||
//! small stand-alone tf-idf library, specifically designed for use in feroxbuster
|
||||
|
||||
mod constants;
|
||||
mod document;
|
||||
mod model;
|
||||
mod term;
|
||||
mod utils;
|
||||
|
||||
pub(crate) use self::document::Document;
|
||||
pub(crate) use self::model::TfIdf;
|
||||
pub(crate) use self::term::Term;
|
||||
140
src/nlp/model.rs
Normal file
140
src/nlp/model.rs
Normal file
@@ -0,0 +1,140 @@
|
||||
use super::document::Document;
|
||||
use super::term::{Term, TermMetaData};
|
||||
use super::utils::{inverse_document_frequency, tf_idf_score};
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// data container for the TF-IDF model
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct TfIdf {
|
||||
/// collection of `Term`s and their associated metadata
|
||||
terms: HashMap<Term, TermMetaData>,
|
||||
|
||||
/// number of documents processed by the model
|
||||
num_documents: usize,
|
||||
}
|
||||
|
||||
impl TfIdf {
|
||||
/// create an empty TF-IDF model; must be populated with `add_document` prior to use
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
terms: HashMap::new(),
|
||||
num_documents: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// accessor method for the collection of `Term`s and `TermMetaData`
|
||||
fn terms(&self) -> &HashMap<Term, TermMetaData> {
|
||||
self.terms.borrow()
|
||||
}
|
||||
|
||||
/// add a `Document` to the model
|
||||
fn add_document(&mut self, document: Document) {
|
||||
// increment number of docs seen, since we don't preserve the document itself; this needs
|
||||
// to happen before calls to `self.inverse_document_frequency`, as it relies on the count
|
||||
// being up to date
|
||||
self.num_documents += 1;
|
||||
|
||||
for (term, doc_metadata) in document.terms().iter() {
|
||||
// an incoming `Term` from a `Document` only has a valid `count` for that particular
|
||||
// document; need to get the term frequency while both are known/valid
|
||||
let term_frequency = document.term_frequency(term);
|
||||
|
||||
let metadata = self
|
||||
.terms
|
||||
.entry(term.clone())
|
||||
.or_insert_with(|| doc_metadata.to_owned());
|
||||
|
||||
metadata.term_frequencies_mut().push(term_frequency);
|
||||
}
|
||||
}
|
||||
|
||||
/// (re)-calculate tf-idf scores for all terms, given the current number of documents
|
||||
///
|
||||
/// # Notes
|
||||
///
|
||||
/// old tf-idf scores are removed during calculations to keep new `Term`s at the same relative
|
||||
/// level as new ones WRT corpus size
|
||||
fn calculate_tf_idf_scores(&mut self) {
|
||||
for metadata in self.terms.borrow_mut().values_mut() {
|
||||
let num_frequencies = metadata.term_frequencies().len();
|
||||
|
||||
// clear out old scores before recalculating
|
||||
metadata.tf_idf_scores_mut().clear();
|
||||
|
||||
let mut to_add = Vec::with_capacity(num_frequencies);
|
||||
|
||||
for frequency in metadata.term_frequencies() {
|
||||
let idf = inverse_document_frequency(
|
||||
self.num_documents as f32,
|
||||
metadata.document_frequency() as f32,
|
||||
);
|
||||
|
||||
let score = tf_idf_score(*frequency, idf);
|
||||
to_add.push(score);
|
||||
}
|
||||
|
||||
metadata.tf_idf_scores_mut().append(&mut to_add);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// helper for this test suite
|
||||
fn get_scores(word: &str, model: &TfIdf) -> Vec<f32> {
|
||||
model
|
||||
.terms()
|
||||
.get(&Term::new(word))
|
||||
.unwrap()
|
||||
.tf_idf_scores()
|
||||
.into()
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// given the example data at https://remykarem.github.io/tfidf-demo/, ensure the model
|
||||
/// produces the same results
|
||||
fn model_generates_expected_tf_idf_scores() {
|
||||
let one = "Air quality in the sunny island improved gradually throughout Wednesday.";
|
||||
let two =
|
||||
"Air quality in Singapore on Wednesday continued to get worse as haze hit the island.";
|
||||
let three = "The air quality in Singapore is monitored through a network of air monitoring stations located in different parts of the island";
|
||||
let four = "The air quality in Singapore got worse on Wednesday.";
|
||||
|
||||
let docs = [one, two, three, four];
|
||||
let mut model = TfIdf::new();
|
||||
|
||||
for doc in docs.iter() {
|
||||
let d = Document::new(doc);
|
||||
model.add_document(d);
|
||||
}
|
||||
|
||||
assert_eq!(model.terms().len(), 19);
|
||||
|
||||
model.calculate_tf_idf_scores();
|
||||
|
||||
assert_eq!(get_scores("quality", &model), [0.0, 0.0, 0.0, 0.0]);
|
||||
assert_eq!(get_scores("air", &model), [0.0, 0.0, 0.0, 0.0]);
|
||||
assert_eq!(
|
||||
get_scores("wednesday", &model),
|
||||
[0.017848395, 0.013882084, 0.024987752]
|
||||
);
|
||||
assert_eq!(
|
||||
get_scores("island", &model),
|
||||
[0.017848395, 0.013882084, 0.010411563]
|
||||
);
|
||||
assert_eq!(
|
||||
get_scores("singapore", &model),
|
||||
[0.013882084, 0.010411563, 0.024987752]
|
||||
);
|
||||
assert_eq!(get_scores("sunny", &model), [0.08600858]);
|
||||
assert_eq!(get_scores("monitoring", &model), [0.05017167]);
|
||||
assert_eq!(get_scores("stations", &model), [0.05017167]);
|
||||
assert_eq!(get_scores("parts", &model), [0.05017167]);
|
||||
assert_eq!(get_scores("haze", &model), [0.06689556]);
|
||||
assert_eq!(get_scores("hit", &model), [0.06689556]);
|
||||
assert_eq!(get_scores("worse", &model), [0.03344778, 0.060206003]);
|
||||
}
|
||||
}
|
||||
105
src/nlp/term.rs
Normal file
105
src/nlp/term.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use std::borrow::BorrowMut;
|
||||
|
||||
/// single word term for text processing
|
||||
#[derive(Debug, Hash, Eq, PartialEq, Default, Clone)]
|
||||
pub struct Term {
|
||||
/// underlying string that the term represents
|
||||
raw: String,
|
||||
}
|
||||
|
||||
impl Term {
|
||||
/// given a word, create a new `Term`
|
||||
pub fn new(word: &str) -> Self {
|
||||
Self {
|
||||
raw: word.to_owned(),
|
||||
}
|
||||
}
|
||||
|
||||
/// return a reference to the underlying string
|
||||
pub fn raw(&self) -> &str {
|
||||
&self.raw
|
||||
}
|
||||
}
|
||||
|
||||
/// metadata to be associated with a `Term`
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct TermMetaData {
|
||||
/// number of times the associated `Term` was seen in a single document
|
||||
count: u32,
|
||||
|
||||
/// collection of term frequencies for the associated `Term`
|
||||
term_frequencies: Vec<f32>,
|
||||
|
||||
/// collection of tf-idf scores for the associated `Term`
|
||||
tf_idf_scores: Vec<f32>,
|
||||
}
|
||||
|
||||
impl TermMetaData {
|
||||
/// create a new metadata container
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// number of times a `Term` has appeared in any `Document` within the corpus
|
||||
pub fn document_frequency(&self) -> usize {
|
||||
self.term_frequencies().len()
|
||||
}
|
||||
|
||||
/// mutable reference to the collection of term frequencies
|
||||
pub fn term_frequencies_mut(&mut self) -> &mut Vec<f32> {
|
||||
self.term_frequencies.borrow_mut()
|
||||
}
|
||||
|
||||
/// immutable reference to the collection of term frequencies
|
||||
pub fn term_frequencies(&self) -> &[f32] {
|
||||
&self.term_frequencies
|
||||
}
|
||||
|
||||
/// mutable reference to the number of times a `Term` was seen in a particular `Document`
|
||||
pub fn count_mut(&mut self) -> &mut u32 {
|
||||
self.count.borrow_mut()
|
||||
}
|
||||
|
||||
/// number of times a `Term` was seen in a particular `Document`
|
||||
pub fn count(&self) -> u32 {
|
||||
self.count
|
||||
}
|
||||
|
||||
/// mutable reference to the collection of tf-idf scores
|
||||
pub fn tf_idf_scores_mut(&mut self) -> &mut Vec<f32> {
|
||||
self.tf_idf_scores.borrow_mut()
|
||||
}
|
||||
|
||||
/// immutable reference to the collection of tf-idf scores
|
||||
pub fn tf_idf_scores(&self) -> &[f32] {
|
||||
&self.tf_idf_scores
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
/// test accessors for correctness
|
||||
fn nlp_term_accessor_test() {
|
||||
let term = Term::new("stuff");
|
||||
assert_eq!(term.raw(), "stuff");
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// test accessors for correctness
|
||||
fn nlp_term_metadata_accessor_test() {
|
||||
let mut metadata = TermMetaData::new();
|
||||
|
||||
*metadata.count_mut() += 1;
|
||||
assert_eq!(metadata.count(), 1);
|
||||
|
||||
metadata.term_frequencies_mut().push(1.0);
|
||||
assert_eq!(metadata.document_frequency(), 1);
|
||||
assert_eq!(metadata.term_frequencies().first().unwrap(), &1.0);
|
||||
|
||||
metadata.tf_idf_scores_mut().push(1.0);
|
||||
assert_eq!(metadata.tf_idf_scores().first().unwrap(), &1.0);
|
||||
}
|
||||
}
|
||||
158
src/nlp/utils.rs
Normal file
158
src/nlp/utils.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
use super::constants::{BOUNDED_WORD_REGEX, STOP_WORDS};
|
||||
use regex::Captures;
|
||||
use std::borrow::Cow;
|
||||
|
||||
/// pre-processing pipeline wrapper that removes punctuation, normalizes word case (utf-8 included)
|
||||
/// to lowercase, and remove stop words
|
||||
pub fn preprocess(text: &str) -> Vec<String> {
|
||||
let text = remove_punctuation(text);
|
||||
let text = normalize_case(text);
|
||||
let text = remove_stop_words(&text);
|
||||
|
||||
text.split_whitespace()
|
||||
.map(|word| word.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// optimized version of `str::to_lowercase`
|
||||
pub fn normalize_case<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
|
||||
let input = input.into();
|
||||
|
||||
let first = input.find(char::is_uppercase);
|
||||
|
||||
if let Some(first_idx) = first {
|
||||
let mut output = String::from(&input[..first_idx]);
|
||||
output.reserve(input.len() - first_idx);
|
||||
|
||||
for c in input[first_idx..].chars() {
|
||||
if c.is_uppercase() {
|
||||
output.push(c.to_lowercase().next().unwrap())
|
||||
} else {
|
||||
output.push(c)
|
||||
}
|
||||
}
|
||||
|
||||
Cow::Owned(output)
|
||||
} else {
|
||||
input
|
||||
}
|
||||
}
|
||||
|
||||
/// remove ascii and some utf-8 punctuation characters from the given string
|
||||
pub fn remove_punctuation(text: &str) -> String {
|
||||
// non-separator type chars can be replaced with an empty string, while separators are replaced
|
||||
// with a space. This attempts to keep things like
|
||||
// 'aboutblogfaqcontactpresstermslexicondisclosure' from happening
|
||||
text.replace(
|
||||
[
|
||||
'!', '\\', '"', '#', '$', '%', '&', '(', ')', '*', '+', ':', ';', '<', '=', '>', '?',
|
||||
'@', '[', ']', '^', '{', '}', '|', '~', ',', '\'', '“', '”', '’', '‘', '’', '‘',
|
||||
],
|
||||
"",
|
||||
)
|
||||
.replace(['/', '–', '—', '.'], " ")
|
||||
}
|
||||
|
||||
/// remove stop words from the given string
|
||||
pub fn remove_stop_words(text: &str) -> String {
|
||||
BOUNDED_WORD_REGEX
|
||||
.replace_all(text, |caps: &Captures| {
|
||||
let word = &caps[0];
|
||||
if !STOP_WORDS.contains(&word) {
|
||||
word.to_owned()
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
})
|
||||
.into()
|
||||
}
|
||||
|
||||
/// calculate inverse document frequency
|
||||
pub fn inverse_document_frequency(num_docs: f32, doc_frequency: f32) -> f32 {
|
||||
f32::log10(num_docs / doc_frequency)
|
||||
}
|
||||
|
||||
/// calculate term frequency-inverse document frequency (tf-idf)
|
||||
pub fn tf_idf_score(term_frequency: f32, idf: f32) -> f32 {
|
||||
term_frequency * idf
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
/// ensure all expected punctuation characters are removed
|
||||
fn test_remove_punctuation() {
|
||||
let tester = "!\\\"#$%&()*+/:;<=>?@[]^{}|~,.'“”’‘–—\n‘’";
|
||||
// the `" \n"` is because of the things like / getting replaced with a space
|
||||
assert_eq!(remove_punctuation(tester), " \n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// ensure uppercase characters are swapped to lowercase
|
||||
fn test_normalize_case() {
|
||||
let tester = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
||||
assert_eq!(normalize_case(tester), "abcdefghijklmnopqrstuvwxyz");
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// ensure all stop words are removed from the list of stopwords ... intestuous
|
||||
fn test_remove_stopwords() {
|
||||
let all_words = STOP_WORDS
|
||||
.iter()
|
||||
.map(|&word| word.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
|
||||
let removed = remove_stop_words(&all_words).replace(' ', "");
|
||||
|
||||
// the remaining chars are from the contraction-based stop words
|
||||
assert_eq!(removed, "'d'll'm''s'ven'tn‘tn’t‘d‘ll‘m‘‘s‘ve’d’ll’m’’s’ve");
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// ensure preprocess
|
||||
fn test_preprocess_results() {
|
||||
let tester = "WHY are Y'all YELLing?";
|
||||
assert_eq!(&preprocess(tester), &["yall", "yelling"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// ensure our calculations conform to the example provided at the link below
|
||||
///
|
||||
/// https://www.kaggle.com/paulrohan2020/tf-idf-tutorial/notebook#TF-IDF-Model
|
||||
///
|
||||
/// Consider a document containing 100 words wherein the word cat appears 3 times.
|
||||
/// The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10
|
||||
/// million documents and the word cat appears in one thousand of these. Then, the inverse
|
||||
/// document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the
|
||||
/// Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
|
||||
fn idf_returns_expected_value() {
|
||||
let num_docs = 10_000_000_f32;
|
||||
let num_occurrences = 1_000_f32;
|
||||
let abs_diff = (inverse_document_frequency(num_docs, num_occurrences) - 4.0).abs();
|
||||
|
||||
assert!(abs_diff <= f32::EPSILON);
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// ensure our calculations conform to the example provided at the link below
|
||||
///
|
||||
/// https://www.kaggle.com/paulrohan2020/tf-idf-tutorial/notebook#TF-IDF-Model
|
||||
///
|
||||
/// Consider a document containing 100 words wherein the word cat appears 3 times.
|
||||
/// The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10
|
||||
/// million documents and the word cat appears in one thousand of these. Then, the inverse
|
||||
/// document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the
|
||||
/// Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
|
||||
fn tf_idf_returns_expected_value() {
|
||||
let term_freq = 0.03_f32;
|
||||
let num_docs = 10_000_000_f32;
|
||||
let num_occurrences = 1_000_f32;
|
||||
let idf = inverse_document_frequency(num_docs, num_occurrences);
|
||||
let abs_diff = (tf_idf_score(term_freq, idf) - 0.12).abs();
|
||||
|
||||
assert!(abs_diff <= f32::EPSILON);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user