removed wildcard test, integrated into 404 detection

This commit is contained in:
epi
2023-02-25 20:58:28 -06:00
parent ac7f59cd3f
commit 655364d9bd
11 changed files with 218 additions and 142 deletions

View File

@@ -208,7 +208,7 @@ impl Banner {
let status_codes = if config.status_codes.len() == DEFAULT_STATUS_CODES.len() + 2 {
let all_str = format!(
"{} {} {}{}",
style("All").blue(),
style("All").cyan(),
style("Status").green(),
style("Codes").yellow(),
style("!").red()

View File

@@ -76,6 +76,8 @@ impl FeroxFilters {
for filter in filters.iter() {
// wildcard.should_filter goes here
if filter.should_filter_response(response) {
log::warn!("filtering response due to: {:?}", filter);
log::warn!("filtering response due to: {:?}", filters);
if filter.as_any().downcast_ref::<WildcardFilter>().is_some() {
tx_stats
.send(AddToUsizeField(WildcardsFiltered, 1))

View File

@@ -11,7 +11,7 @@ pub(crate) use self::empty::EmptyFilter;
pub use self::init::initialize;
pub use self::lines::LinesFilter;
pub use self::regex::RegexFilter;
pub use self::similarity::{SimilarityFilter, HashValueType};
pub use self::similarity::{SimilarityFilter, SIM_HASHER};
pub use self::size::SizeFilter;
pub use self::status_code::StatusCodeFilter;
pub(crate) use self::utils::{create_similarity_filter, filter_lookup};

View File

@@ -1,33 +1,26 @@
use super::*;
use fuzzyhash::FuzzyHash;
use gaoya::minhash::{MinHasher, MinHasher16};
use gaoya::text::whitespace_split;
use gaoya::simhash::{SimHash, SimHashBits, SimSipHasher64};
use crate::nlp::preprocess;
use lazy_static::lazy_static;
/// enum wrapper for two distinct hashing signature types
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum HashValueType {
/// String value for FuzzyHash
String(String),
/// Vec<u16> value for minhash
Vec(Vec<u16>),
lazy_static! {
/// single instance of the sip hasher used in similarity filtering
pub static ref SIM_HASHER: SimHash<SimSipHasher64, u64, 64> =
SimHash::<SimSipHasher64, u64, 64>::new(SimSipHasher64::new(1, 2));
}
impl Default for HashValueType {
fn default() -> Self {
Self::String(String::new())
}
}
/// maximum hamming distance allowed between two signatures
///
/// ref: https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33026.pdf
/// section: 4.1 Choice of Parameters
const MAX_HAMMING_DISTANCE: usize = 3;
/// Simple implementor of FeroxFilter; used to filter out responses based on the similarity of a
/// Response body with a known response; specified using --filter-similar-to
#[derive(Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct SimilarityFilter {
/// Hash of Response's body to be used during similarity comparison
pub hash: HashValueType,
/// Percentage of similarity at which a page is determined to be a near-duplicate of another
pub threshold: u32,
pub hash: u64,
/// Url originally requested for the similarity filter
pub original_url: String,
@@ -38,30 +31,8 @@ impl FeroxFilter for SimilarityFilter {
/// Check `FeroxResponse::text` against what was requested from the site passed in via
/// --filter-similar-to
fn should_filter_response(&self, response: &FeroxResponse) -> bool {
match self.hash {
HashValueType::String(ref hash) => {
// original response size was over the minimum required to effectively use ssdeep
let other = FuzzyHash::new(response.text());
if let Ok(result) = FuzzyHash::compare(hash, other.to_string()) {
return result >= self.threshold;
}
}
HashValueType::Vec(ref hash) => {
// original response was too small for ssdeep, so minhash was used as an alternative
let hasher = MinHasher16::new(256);
let other = hasher.create_signature(whitespace_split(response.text()));
let result = hasher.compute_similarity(hash.iter(), other.iter());
return (result * 100.0) as u32 >= self.threshold;
}
}
// couldn't hash the response, don't filter
log::warn!(
"Could not compare similarity of body from {}; returning not-similar",
response.url().as_str()
);
false
let other = SIM_HASHER.create_signature(preprocess(response.text()).iter());
return self.hash.hamming_distance(&other) <= MAX_HAMMING_DISTANCE;
}
/// Compare one SimilarityFilter to another

View File

@@ -1,14 +1,12 @@
use super::similarity::HashValueType;
use super::FeroxFilter;
use super::SimilarityFilter;
use crate::event_handlers::Handles;
use crate::filters::similarity::SIM_HASHER;
use crate::nlp::preprocess;
use crate::response::FeroxResponse;
use crate::utils::logged_request;
use crate::{DEFAULT_METHOD, MIN_SSDEEP_SIZE, SIMILARITY_THRESHOLD};
use crate::DEFAULT_METHOD;
use anyhow::Result;
use fuzzyhash::FuzzyHash;
use gaoya::minhash::{MinHasher, MinHasher16};
use gaoya::text::whitespace_split;
use regex::Regex;
use reqwest::Url;
use std::sync::Arc;
@@ -43,21 +41,10 @@ pub(crate) async fn create_similarity_filter(
fr.parse_extension(handles.clone())?;
}
// hash the response body and store the resulting hash in the filter object
let hash = if fr.content_length() <= MIN_SSDEEP_SIZE {
log::debug!("response too small for ssdeep, using minhash for comparison");
// response too small for ssdeep
let hasher = MinHasher16::new(256);
HashValueType::Vec(hasher.create_signature(whitespace_split(fr.text())))
} else {
// size over ssdeep's minimum value
log::debug!("response large enough to use ssdeep for comparison");
HashValueType::String(FuzzyHash::new(fr.text()).to_string())
};
let hash = SIM_HASHER.create_signature(preprocess(fr.text()).iter());
Ok(SimilarityFilter {
hash,
threshold: SIMILARITY_THRESHOLD,
original_url: similarity_filter.to_string(),
})
}
@@ -107,8 +94,7 @@ pub(crate) fn filter_lookup(filter_type: &str, filter_value: &str) -> Option<Box
}
"similarity" => {
return Some(Box::new(SimilarityFilter {
hash: HashValueType::String(String::new()),
threshold: SIMILARITY_THRESHOLD,
hash: 0,
original_url: filter_value.to_string(),
}));
}
@@ -169,8 +155,7 @@ mod tests {
assert_eq!(
filter.as_any().downcast_ref::<SimilarityFilter>().unwrap(),
&SimilarityFilter {
hash: HashValueType::String(String::new()),
threshold: SIMILARITY_THRESHOLD,
hash: 0,
original_url: "http://localhost".to_string()
}
);
@@ -208,7 +193,6 @@ mod tests {
filter,
SimilarityFilter {
hash: HashValueType::String("3:YKEpn:Yfp".to_string()),
threshold: SIMILARITY_THRESHOLD,
original_url: srv.url("/")
}
);

View File

@@ -5,7 +5,9 @@ use console::style;
use scraper::{Html, Selector};
use uuid::Uuid;
use crate::filters::{SimilarityFilter, SIM_HASHER};
use crate::message::FeroxMessage;
use crate::nlp::preprocess;
use crate::{
config::OutputLevel,
event_handlers::{Command, Handles},
@@ -183,10 +185,6 @@ impl HeuristicTests {
if wc2_length == wc_length + (UUID_LENGTH * 2) {
// second length is what we'd expect to see if the requested url is
// reflected in the response along with some static content; aka custom 404
let url_len = ferox_url.path_length()?;
log::warn!("{:?}", dbg!(url_len, wc_length, wc2_length));
log::warn!("{:?}", ferox_url);
wildcard.dynamic = wc_length - UUID_LENGTH;
print_dont_filter_message(
@@ -442,73 +440,188 @@ impl HeuristicTests {
/// given a target's base url, attempt to automatically detect its 404 response
/// pattern, and then set a filter that will exclude all but the first result
pub async fn detect_404_response(&self, target_url: &str) -> Result<bool> {
log::trace!("enter: detect_404_response");
pub async fn detect_404_response(&self, target_url: &str) -> Result<u64> {
log::trace!("enter: detect_404_response({:?})", target_url);
if self.handles.config.dont_filter {
// early return, dont_filter scans don't need to be tested
log::trace!("exit: detect_404_response -> dont_filter is true");
return Ok(false);
}
let mut responses = Vec::with_capacity(3);
for prefix in ["", ".htaccess", "admin"] {
let path = format!("{prefix}{}", self.unique_string(1));
let ferox_url = FeroxUrl::from_string(target_url, self.handles.clone());
let request = ferox_url.format(&path, None)?;
let response =
logged_request(&request, DEFAULT_METHOD, None, self.handles.clone()).await;
let response = skip_fail!(response);
let ferox_response = FeroxResponse::from(
response,
&ferox_url.target,
DEFAULT_METHOD,
self.handles.config.output_level,
)
.await;
responses.push(ferox_response);
return Ok(0);
}
let mut size_sentry = true;
let mut word_sentry = true;
let mut line_sentry = true;
let mut req_counter = 0;
let content_length = responses[0].content_length();
let word_count = responses[0].word_count();
let line_count = responses[0].line_count();
let data = if self.handles.config.data.is_empty() {
None
} else {
Some(self.handles.config.data.as_slice())
};
for response in &responses[1..] {
if response.content_length() != content_length {
size_sentry = false;
// 4 is due to the array in the nested for loop below
let mut responses = Vec::with_capacity(4);
for method in self.handles.config.methods.iter() {
for (prefix, length) in [("", 1), ("", 3), (".htaccess", 1), ("admin", 1)] {
let path = format!("{prefix}{}", self.unique_string(length));
// To take care of slash when needed
let slash = if self.handles.config.add_slash {
Some("/")
} else {
None
};
let ferox_url = FeroxUrl::from_string(target_url, self.handles.clone());
let nonexistent_url = ferox_url.format(&path, slash)?;
let response =
logged_request(&nonexistent_url, &method, data, self.handles.clone()).await;
req_counter += 1;
// continue to next on error
let response = skip_fail!(response);
if !self
.handles
.config
.status_codes
.contains(&response.status().as_u16())
{
// if the response code isn't one that's accepted via -s values, then skip to the next
//
// the default value for -s is all status codes, so unless the user says otherwise
// this won't fire
continue;
}
let ferox_response = FeroxResponse::from(
response,
&ferox_url.target,
&method,
self.handles.config.output_level,
)
.await;
responses.push(ferox_response);
}
if response.word_count() != word_count {
word_sentry = false;
if responses.len() < 2 {
// don't have enough responses to make a determination, continue to next method
continue;
}
if response.line_count() != line_count {
line_sentry = false;
// examine chars/words/lines for each response
// if all responses' respective lengths match each other, we can assume
// that will remain true for subsequent non-existent urls, and create
// a filter for it.
//
// values are examined from most to least specific (content length, word count, line count)
let content_length = responses[0].content_length();
let word_count = responses[0].word_count();
let line_count = responses[0].line_count();
for response in &responses[1..] {
// if any of the responses differ in length, that particular
// response length type is no longer a candidate for filtering
if response.content_length() != content_length {
size_sentry = false;
}
if response.word_count() != word_count {
word_sentry = false;
}
if response.line_count() != line_count {
line_sentry = false;
}
}
// the if/else-if/else nature of the block means that we'll get the most
// specific match, if one is to be had
//
// each block returns the information needed to send the filter away and
// display a message to the user
let (command, filter_type, filter_length) = if size_sentry {
log::info!(
"[404-like] {target_url} => filtering future responses with {content_length} bytes"
);
(
Command::AddFilter(Box::new(SizeFilter { content_length })),
"bytes",
content_length as usize,
)
} else if word_sentry {
log::info!(
"[404-like] {target_url} => filtering future responses with {word_count} words"
);
(
Command::AddFilter(Box::new(WordsFilter { word_count })),
"words",
word_count,
)
} else if line_sentry {
log::info!(
"[404-like] {target_url} => filtering future responses with {line_count} lines"
);
(
Command::AddFilter(Box::new(LinesFilter { line_count })),
"lines",
line_count,
)
} else {
log::trace!("exit: detect_404_response -> no filter added");
// no match was found; clear the vec and continue to the next
responses.clear();
continue;
};
match command {
Command::AddFilter(ref filter) => {
if let Ok(guard) = self.handles.filters.data.filters.read() {
if guard.contains(filter) {
// match was found, but already known; clear the vec and continue to the next
responses.clear();
continue;
}
}
}
_ => {}
}
// if we're here, we've detected a 404-like response pattern, so we're already filtering for size/word/line
// in addition, we'll create a similarity filter as a fallback
let hash = SIM_HASHER.create_signature(preprocess(responses[0].text()).iter());
let sim_filter = SimilarityFilter {
hash,
original_url: responses[0].url().to_string(),
};
self.handles.filters.send(command)?;
// reset the responses for the next method, if it exists
responses.clear();
self.handles
.filters
.send(Command::AddFilter(Box::new(sim_filter)))?;
if matches!(
self.handles.config.output_level,
OutputLevel::Default | OutputLevel::Quiet
) {
let msg = format!("{} {:>8} {:>9} {:>9} {:>9} {} => {} {}-like response ({} {}); toggle this behavior by using {}\n", status_colorizer("WLD"), "-", "-", "-", "-", style(target_url).cyan(), style("auto-filtering").bright().green(), style("404").red(), style(filter_length).cyan(), filter_type, style("--dont-filter").yellow());
ferox_print(&msg, &PROGRESS_PRINTER);
}
}
let command = if size_sentry {
Command::AddFilter(Box::new(SizeFilter { content_length }))
} else if word_sentry {
Command::AddFilter(Box::new(WordsFilter { word_count }))
} else if line_sentry {
Command::AddFilter(Box::new(LinesFilter { line_count }))
} else {
log::trace!("exit: detect_404_response -> no filter added");
return Ok(false);
};
self.handles.filters.send(command)?;
log::trace!("exit: detect_404_response");
Ok(size_sentry || word_sentry || line_sentry)
return Ok(req_counter);
}
}

View File

@@ -52,14 +52,6 @@ pub(crate) const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Maximum number of file descriptors that can be opened during a scan
pub const DEFAULT_OPEN_FILE_LIMIT: u64 = 8192;
/// Default value used to determine near-duplicate web pages (equivalent to 95%)
pub const SIMILARITY_THRESHOLD: u32 = 95;
/// Minimum size of response body for ssdeep to produce meaningful results
///
/// ref: https://github.com/glaslos/ssdeep/issues/17
pub(crate) const MIN_SSDEEP_SIZE: u64 = 4096;
/// Default set of extensions to Ignore when auto-collecting extensions during scans
pub(crate) const DEFAULT_IGNORED_EXTENSIONS: [&str; 38] = [
"tif", "tiff", "ico", "cur", "bmp", "webp", "svg", "png", "jpg", "jpeg", "jfif", "gif", "avif",
@@ -116,7 +108,7 @@ pub const DEFAULT_STATUS_CODES: [StatusCode; 60] = [
StatusCode::USE_PROXY,
StatusCode::TEMPORARY_REDIRECT,
StatusCode::PERMANENT_REDIRECT,
// all 4XX response codes
// all 4XX response codes
StatusCode::BAD_REQUEST,
StatusCode::UNAUTHORIZED,
StatusCode::PAYMENT_REQUIRED,

View File

@@ -8,3 +8,4 @@ mod utils;
pub(crate) use self::document::Document;
pub(crate) use self::model::TfIdf;
pub(crate) use self::utils::preprocess;

View File

@@ -4,7 +4,7 @@ use std::borrow::Cow;
/// pre-processing pipeline wrapper that removes punctuation, normalizes word case (utf-8 included)
/// to lowercase, and removes stop words
pub(super) fn preprocess(text: &str) -> Vec<String> {
pub(crate) fn preprocess(text: &str) -> Vec<String> {
let text = remove_punctuation(text);
let text = normalize_case(text);
let text = remove_stop_words(&text);

View File

@@ -453,7 +453,7 @@ impl FeroxSerialize for FeroxResponse {
// create the base message
let mut message = format!(
"{} {:>8} {:>8}l {:>8}w {:>8}c Got {} for {} (url length: {})\n",
"{} {:>8} {:>8}l {:>8}w {:>8}c Got {} for {}\n",
wild_status,
method,
lines,
@@ -461,7 +461,6 @@ impl FeroxSerialize for FeroxResponse {
chars,
status_colorizer(status),
self.url(),
FeroxUrl::path_length_of_url(&self.url)
);
if self.status().is_redirection() {

View File

@@ -244,7 +244,7 @@ impl FeroxScanner {
}
{
// heuristics test block
// heuristics test block:
let test = heuristics::HeuristicTests::new(self.handles.clone());
if let Ok(dirlist_result) = test.directory_listing(&self.target_url).await {
@@ -294,15 +294,29 @@ impl FeroxScanner {
}
}
let detected = test.detect_404_response(&self.target_url).await?;
log::info!("404 response detected: {}", detected);
// now that we haven't found a directory listing, we'll attempt to derive whatever
// the server is using to respond to resources that don't exist (could be a
// traditional 404, or a custom response)
//
// `detect_404_response` will make the requests that the wildcard test used to
// perform pre-2.8, and then hand those off to `wildcard`, so we're not
// duplicating effort.
//
// the wildcard test will only add a filter in the event that the filter it
// would create isn't already being filtered by the user or by the
// auto-detected 404-like response
let num_reqs_made = test.detect_404_response(&self.target_url).await?;
if !detected {
// on error, we'll have 0, same for --dont-filter
// anything higher than 0 indicates a wildcard was found
let num_reqs_made = test.wildcard(&self.target_url).await.unwrap_or_default();
progress_bar.inc(num_reqs_made);
}
// let num_reqs_made = test.wildcard(detection_responses).await.unwrap_or_default();
progress_bar.inc(num_reqs_made);
// if !detected {
// // on error, we'll have 0, same for --dont-filter
// // anything higher than 0 indicates a wildcard was found
// let num_reqs_made = test.wildcard(&self.target_url).await.unwrap_or_default();
// progress_bar.inc(num_reqs_made);
// }
}
// Arc clones to be passed around to the various scans