diff --git a/src/banner/container.rs b/src/banner/container.rs index 8182298..59a6d71 100644 --- a/src/banner/container.rs +++ b/src/banner/container.rs @@ -208,7 +208,7 @@ impl Banner { let status_codes = if config.status_codes.len() == DEFAULT_STATUS_CODES.len() + 2 { let all_str = format!( "{} {} {}{}", - style("All").blue(), + style("All").cyan(), style("Status").green(), style("Codes").yellow(), style("!").red() diff --git a/src/filters/container.rs b/src/filters/container.rs index f3f4cf2..c44fc11 100644 --- a/src/filters/container.rs +++ b/src/filters/container.rs @@ -76,6 +76,8 @@ impl FeroxFilters { for filter in filters.iter() { // wildcard.should_filter goes here if filter.should_filter_response(response) { + log::warn!("filtering response due to: {:?}", filter); + log::warn!("filtering response due to: {:?}", filters); if filter.as_any().downcast_ref::().is_some() { tx_stats .send(AddToUsizeField(WildcardsFiltered, 1)) diff --git a/src/filters/mod.rs b/src/filters/mod.rs index 04828b1..443230d 100644 --- a/src/filters/mod.rs +++ b/src/filters/mod.rs @@ -11,7 +11,7 @@ pub(crate) use self::empty::EmptyFilter; pub use self::init::initialize; pub use self::lines::LinesFilter; pub use self::regex::RegexFilter; -pub use self::similarity::{SimilarityFilter, HashValueType}; +pub use self::similarity::{SimilarityFilter, SIM_HASHER}; pub use self::size::SizeFilter; pub use self::status_code::StatusCodeFilter; pub(crate) use self::utils::{create_similarity_filter, filter_lookup}; diff --git a/src/filters/similarity.rs b/src/filters/similarity.rs index 961747a..7d1dc02 100644 --- a/src/filters/similarity.rs +++ b/src/filters/similarity.rs @@ -1,33 +1,26 @@ use super::*; -use fuzzyhash::FuzzyHash; -use gaoya::minhash::{MinHasher, MinHasher16}; -use gaoya::text::whitespace_split; +use gaoya::simhash::{SimHash, SimHashBits, SimSipHasher64}; +use crate::nlp::preprocess; +use lazy_static::lazy_static; -/// enum wrapper for two distinct hashing signature types 
-#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] -pub enum HashValueType { - /// String value for FuzzyHash - String(String), - - /// Vec value for minhash - Vec(Vec), +lazy_static! { + /// single instance of the sip hasher used in similarity filtering + pub static ref SIM_HASHER: SimHash = + SimHash::::new(SimSipHasher64::new(1, 2)); } -impl Default for HashValueType { - fn default() -> Self { - Self::String(String::new()) - } -} +/// maximum hamming distance allowed between two signatures +/// +/// ref: https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33026.pdf +/// section: 4.1 Choice of Parameters +const MAX_HAMMING_DISTANCE: usize = 3; /// Simple implementor of FeroxFilter; used to filter out responses based on the similarity of a /// Response body with a known response; specified using --filter-similar-to #[derive(Default, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct SimilarityFilter { /// Hash of Response's body to be used during similarity comparison - pub hash: HashValueType, - - /// Percentage of similarity at which a page is determined to be a near-duplicate of another - pub threshold: u32, + pub hash: u64, /// Url originally requested for the similarity filter pub original_url: String, @@ -38,30 +31,8 @@ impl FeroxFilter for SimilarityFilter { /// Check `FeroxResponse::text` against what was requested from the site passed in via /// --filter-similar-to fn should_filter_response(&self, response: &FeroxResponse) -> bool { - match self.hash { - HashValueType::String(ref hash) => { - // original response size was over the minimum required to effectively use ssdeep - let other = FuzzyHash::new(response.text()); - - if let Ok(result) = FuzzyHash::compare(hash, other.to_string()) { - return result >= self.threshold; - } - } - HashValueType::Vec(ref hash) => { - // original response was too small for ssdeep, so minhash was used as an alternative - let hasher = MinHasher16::new(256); - let other = 
hasher.create_signature(whitespace_split(response.text())); - let result = hasher.compute_similarity(hash.iter(), other.iter()); - return (result * 100.0) as u32 >= self.threshold; - } - } - - // couldn't hash the response, don't filter - log::warn!( - "Could not compare similarity of body from {}; returning not-similar", - response.url().as_str() - ); - false + let other = SIM_HASHER.create_signature(preprocess(response.text()).iter()); + return self.hash.hamming_distance(&other) <= MAX_HAMMING_DISTANCE; } /// Compare one SimilarityFilter to another diff --git a/src/filters/utils.rs b/src/filters/utils.rs index 9a69e82..98705fa 100644 --- a/src/filters/utils.rs +++ b/src/filters/utils.rs @@ -1,14 +1,12 @@ -use super::similarity::HashValueType; use super::FeroxFilter; use super::SimilarityFilter; use crate::event_handlers::Handles; +use crate::filters::similarity::SIM_HASHER; +use crate::nlp::preprocess; use crate::response::FeroxResponse; use crate::utils::logged_request; -use crate::{DEFAULT_METHOD, MIN_SSDEEP_SIZE, SIMILARITY_THRESHOLD}; +use crate::DEFAULT_METHOD; use anyhow::Result; -use fuzzyhash::FuzzyHash; -use gaoya::minhash::{MinHasher, MinHasher16}; -use gaoya::text::whitespace_split; use regex::Regex; use reqwest::Url; use std::sync::Arc; @@ -43,21 +41,10 @@ pub(crate) async fn create_similarity_filter( fr.parse_extension(handles.clone())?; } - // hash the response body and store the resulting hash in the filter object - let hash = if fr.content_length() <= MIN_SSDEEP_SIZE { - log::debug!("response too small for ssdeep, using minhash for comparison"); - // response too small for ssdeep - let hasher = MinHasher16::new(256); - HashValueType::Vec(hasher.create_signature(whitespace_split(fr.text()))) - } else { - // size over ssdeep's minimum value - log::debug!("response large enough to use ssdeep for comparison"); - HashValueType::String(FuzzyHash::new(fr.text()).to_string()) - }; + let hash = SIM_HASHER.create_signature(preprocess(fr.text()).iter()); 
Ok(SimilarityFilter { hash, - threshold: SIMILARITY_THRESHOLD, original_url: similarity_filter.to_string(), }) } @@ -107,8 +94,7 @@ pub(crate) fn filter_lookup(filter_type: &str, filter_value: &str) -> Option { return Some(Box::new(SimilarityFilter { - hash: HashValueType::String(String::new()), - threshold: SIMILARITY_THRESHOLD, + hash: 0, original_url: filter_value.to_string(), })); } @@ -169,8 +155,7 @@ mod tests { assert_eq!( filter.as_any().downcast_ref::().unwrap(), &SimilarityFilter { - hash: HashValueType::String(String::new()), - threshold: SIMILARITY_THRESHOLD, + hash: 0, original_url: "http://localhost".to_string() } ); @@ -208,7 +193,6 @@ mod tests { filter, SimilarityFilter { hash: HashValueType::String("3:YKEpn:Yfp".to_string()), - threshold: SIMILARITY_THRESHOLD, original_url: srv.url("/") } ); diff --git a/src/heuristics.rs b/src/heuristics.rs index 36d7e00..95c4b51 100644 --- a/src/heuristics.rs +++ b/src/heuristics.rs @@ -5,7 +5,9 @@ use console::style; use scraper::{Html, Selector}; use uuid::Uuid; +use crate::filters::{SimilarityFilter, SIM_HASHER}; use crate::message::FeroxMessage; +use crate::nlp::preprocess; use crate::{ config::OutputLevel, event_handlers::{Command, Handles}, @@ -183,10 +185,6 @@ impl HeuristicTests { if wc2_length == wc_length + (UUID_LENGTH * 2) { // second length is what we'd expect to see if the requested url is // reflected in the response along with some static content; aka custom 404 - let url_len = ferox_url.path_length()?; - - log::warn!("{:?}", dbg!(url_len, wc_length, wc2_length)); - log::warn!("{:?}", ferox_url); wildcard.dynamic = wc_length - UUID_LENGTH; print_dont_filter_message( @@ -442,73 +440,188 @@ impl HeuristicTests { /// given a target's base url, attempt to automatically detect its 404 response /// pattern, and then set a filter that will exclude all but the first result - pub async fn detect_404_response(&self, target_url: &str) -> Result { - log::trace!("enter: detect_404_response"); + pub async fn 
detect_404_response(&self, target_url: &str) -> Result { + log::trace!("enter: detect_404_response({:?})", target_url); if self.handles.config.dont_filter { + // early return, dont_filter scans don't need to be tested log::trace!("exit: detect_404_response -> dont_filter is true"); - return Ok(false); - } - - let mut responses = Vec::with_capacity(3); - - for prefix in ["", ".htaccess", "admin"] { - let path = format!("{prefix}{}", self.unique_string(1)); - let ferox_url = FeroxUrl::from_string(target_url, self.handles.clone()); - let request = ferox_url.format(&path, None)?; - let response = - logged_request(&request, DEFAULT_METHOD, None, self.handles.clone()).await; - - let response = skip_fail!(response); - - let ferox_response = FeroxResponse::from( - response, - &ferox_url.target, - DEFAULT_METHOD, - self.handles.config.output_level, - ) - .await; - - responses.push(ferox_response); + return Ok(0); } let mut size_sentry = true; let mut word_sentry = true; let mut line_sentry = true; + let mut req_counter = 0; - let content_length = responses[0].content_length(); - let word_count = responses[0].word_count(); - let line_count = responses[0].line_count(); + let data = if self.handles.config.data.is_empty() { + None + } else { + Some(self.handles.config.data.as_slice()) + }; - for response in &responses[1..]
{ - if response.content_length() != content_length { - size_sentry = false; + // 4 is due to the array in the nested for loop below + let mut responses = Vec::with_capacity(4); + + for method in self.handles.config.methods.iter() { + for (prefix, length) in [("", 1), ("", 3), (".htaccess", 1), ("admin", 1)] { + let path = format!("{prefix}{}", self.unique_string(length)); + + // To take care of slash when needed + let slash = if self.handles.config.add_slash { + Some("/") + } else { + None + }; + + let ferox_url = FeroxUrl::from_string(target_url, self.handles.clone()); + + let nonexistent_url = ferox_url.format(&path, slash)?; + + let response = + logged_request(&nonexistent_url, &method, data, self.handles.clone()).await; + + req_counter += 1; + + // continue to next on error + let response = skip_fail!(response); + + if !self + .handles + .config + .status_codes + .contains(&response.status().as_u16()) + { + // if the response code isn't one that's accepted via -s values, then skip to the next + // + // the default value for -s is all status codes, so unless the user says otherwise + // this won't fire + continue; + } + + let ferox_response = FeroxResponse::from( + response, + &ferox_url.target, + &method, + self.handles.config.output_level, + ) + .await; + + responses.push(ferox_response); } - if response.word_count() != word_count { - word_sentry = false; + if responses.len() < 2 { + // don't have enough responses to make a determination, continue to next method + continue; } - if response.line_count() != line_count { - line_sentry = false; + // examine chars/words/lines for each response + // if all responses' respective lengths match each other, we can assume + // that will remain true for subsequent non-existent urls, and create + // a filter for it.
+ // + // values are examined from most to least specific (content length, word count, line count) + let content_length = responses[0].content_length(); + let word_count = responses[0].word_count(); + let line_count = responses[0].line_count(); + + for response in &responses[1..] { + // if any of the responses differ in length, that particular + // response length type is no longer a candidate for filtering + if response.content_length() != content_length { + size_sentry = false; + } + + if response.word_count() != word_count { + word_sentry = false; + } + + if response.line_count() != line_count { + line_sentry = false; + } + } + + // the if/else-if/else nature of the block means that we'll get the most + // specific match, if one is to be had + // + // each block returns the information needed to send the filter away and + // display a message to the user + let (command, filter_type, filter_length) = if size_sentry { + log::info!( + "[404-like] {target_url} => filtering future responses with {content_length} bytes" + ); + ( + Command::AddFilter(Box::new(SizeFilter { content_length })), + "bytes", + content_length as usize, + ) + } else if word_sentry { + log::info!( + "[404-like] {target_url} => filtering future responses with {word_count} words" + ); + ( + Command::AddFilter(Box::new(WordsFilter { word_count })), + "words", + word_count, + ) + } else if line_sentry { + log::info!( + "[404-like] {target_url} => filtering future responses with {line_count} lines" + ); + ( + Command::AddFilter(Box::new(LinesFilter { line_count })), + "lines", + line_count, + ) + } else { + log::trace!("exit: detect_404_response -> no filter added"); + // no match was found; clear the vec and continue to the next + responses.clear(); + continue; + }; + + match command { + Command::AddFilter(ref filter) => { + if let Ok(guard) = self.handles.filters.data.filters.read() { + if guard.contains(filter) { + // match was found, but already known; clear the vec and continue to the next +
responses.clear(); + continue; + } + } + } + _ => {} + } + + // if we're here, we've detected a 404-like response pattern, so we're already filtering for size/word/line + // in addition, we'll create a similarity filter as a fallback + let hash = SIM_HASHER.create_signature(preprocess(responses[0].text()).iter()); + + let sim_filter = SimilarityFilter { + hash, + original_url: responses[0].url().to_string(), + }; + + self.handles.filters.send(command)?; + + // reset the responses for the next method, if it exists + responses.clear(); + + self.handles + .filters + .send(Command::AddFilter(Box::new(sim_filter)))?; + if matches!( + self.handles.config.output_level, + OutputLevel::Default | OutputLevel::Quiet + ) { + let msg = format!("{} {:>8} {:>9} {:>9} {:>9} {} => {} {}-like response ({} {}); toggle this behavior by using {}\n", status_colorizer("WLD"), "-", "-", "-", "-", style(target_url).cyan(), style("auto-filtering").bright().green(), style("404").red(), style(filter_length).cyan(), filter_type, style("--dont-filter").yellow()); + ferox_print(&msg, &PROGRESS_PRINTER); } } - let command = if size_sentry { - Command::AddFilter(Box::new(SizeFilter { content_length })) - } else if word_sentry { - Command::AddFilter(Box::new(WordsFilter { word_count })) - } else if line_sentry { - Command::AddFilter(Box::new(LinesFilter { line_count })) - } else { - log::trace!("exit: detect_404_response -> no filter added"); - return Ok(false); - }; - - self.handles.filters.send(command)?; - log::trace!("exit: detect_404_response"); - Ok(size_sentry || word_sentry || line_sentry) + + return Ok(req_counter); } } diff --git a/src/lib.rs b/src/lib.rs index e09841a..c327d5a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,14 +52,6 @@ pub(crate) const VERSION: &str = env!("CARGO_PKG_VERSION"); /// Maximum number of file descriptors that can be opened during a scan pub const DEFAULT_OPEN_FILE_LIMIT: u64 = 8192; -/// Default value used to determine near-duplicate web pages (equivalent to 
95%) -pub const SIMILARITY_THRESHOLD: u32 = 95; - -/// Minimum size of response body for ssdeep to produce meaningful results -/// -/// ref: https://github.com/glaslos/ssdeep/issues/17 -pub(crate) const MIN_SSDEEP_SIZE: u64 = 4096; - /// Default set of extensions to Ignore when auto-collecting extensions during scans pub(crate) const DEFAULT_IGNORED_EXTENSIONS: [&str; 38] = [ "tif", "tiff", "ico", "cur", "bmp", "webp", "svg", "png", "jpg", "jpeg", "jfif", "gif", "avif", @@ -116,7 +108,7 @@ pub const DEFAULT_STATUS_CODES: [StatusCode; 60] = [ StatusCode::USE_PROXY, StatusCode::TEMPORARY_REDIRECT, StatusCode::PERMANENT_REDIRECT, - // all 4XX response codes + // all 4XX response codes StatusCode::BAD_REQUEST, StatusCode::UNAUTHORIZED, StatusCode::PAYMENT_REQUIRED, diff --git a/src/nlp/mod.rs b/src/nlp/mod.rs index 6fad768..6da1ec6 100644 --- a/src/nlp/mod.rs +++ b/src/nlp/mod.rs @@ -8,3 +8,4 @@ mod utils; pub(crate) use self::document::Document; pub(crate) use self::model::TfIdf; +pub(crate) use self::utils::preprocess; diff --git a/src/nlp/utils.rs b/src/nlp/utils.rs index f9c84d4..2530122 100644 --- a/src/nlp/utils.rs +++ b/src/nlp/utils.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; /// pre-processing pipeline wrapper that removes punctuation, normalizes word case (utf-8 included) /// to lowercase, and remove stop words -pub(super) fn preprocess(text: &str) -> Vec { +pub(crate) fn preprocess(text: &str) -> Vec { let text = remove_punctuation(text); let text = normalize_case(text); let text = remove_stop_words(&text); diff --git a/src/response.rs b/src/response.rs index c5d2eec..4c04921 100644 --- a/src/response.rs +++ b/src/response.rs @@ -453,7 +453,7 @@ impl FeroxSerialize for FeroxResponse { // create the base message let mut message = format!( - "{} {:>8} {:>8}l {:>8}w {:>8}c Got {} for {} (url length: {})\n", + "{} {:>8} {:>8}l {:>8}w {:>8}c Got {} for {}\n", wild_status, method, lines, @@ -461,7 +461,6 @@ impl FeroxSerialize for FeroxResponse { chars, 
status_colorizer(status), self.url(), - FeroxUrl::path_length_of_url(&self.url) ); if self.status().is_redirection() { diff --git a/src/scanner/ferox_scanner.rs b/src/scanner/ferox_scanner.rs index d011d6b..43bf14c 100644 --- a/src/scanner/ferox_scanner.rs +++ b/src/scanner/ferox_scanner.rs @@ -244,7 +244,7 @@ impl FeroxScanner { } { - // heuristics test block + // heuristics test block: let test = heuristics::HeuristicTests::new(self.handles.clone()); if let Ok(dirlist_result) = test.directory_listing(&self.target_url).await { @@ -294,15 +294,29 @@ impl FeroxScanner { } } - let detected = test.detect_404_response(&self.target_url).await?; - log::info!("404 response detected: {}", detected); + // now that we haven't found a directory listing, we'll attempt to derive whatever + // the server is using to respond to resources that don't exist (could be a + // traditional 404, or a custom response) + // + // `detect_404_response` will make the requests that the wildcard test used to + // perform pre-2.8, and then hand those off to `wildcard`, so we're not + // duplicating effort. 
+ // + // the wildcard test will only add a filter in the event that the filter it + // would create isn't already being filtered by the user or by the + // auto-detected 404-like response + let num_reqs_made = test.detect_404_response(&self.target_url).await?; - if !detected { - // on error, we'll have 0, same for --dont-filter - // anything higher than 0 indicates a wildcard was found - let num_reqs_made = test.wildcard(&self.target_url).await.unwrap_or_default(); - progress_bar.inc(num_reqs_made); - } + + // let num_reqs_made = test.wildcard(detection_responses).await.unwrap_or_default(); + progress_bar.inc(num_reqs_made); + + // if !detected { + // // on error, we'll have 0, same for --dont-filter + // // anything higher than 0 indicates a wildcard was found + // let num_reqs_made = test.wildcard(&self.target_url).await.unwrap_or_default(); + // progress_bar.inc(num_reqs_made); + // } } // Arc clones to be passed around to the various scans