From 96375e7734c0c31cc2e38d8b96ed2f2cfbcb7026 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 25 Feb 2023 06:20:30 -0600 Subject: [PATCH] added minhash algo when resp too short for ssdeep --- src/filters/mod.rs | 2 +- src/filters/similarity.rs | 41 +++++++++++++++++++++++++++++++++++---- src/filters/tests.rs | 12 ++++++------ src/filters/utils.rs | 20 ++++++++++++++----- src/heuristics.rs | 1 - src/lib.rs | 5 +++++ src/scan_manager/tests.rs | 4 ++-- 7 files changed, 66 insertions(+), 19 deletions(-) diff --git a/src/filters/mod.rs b/src/filters/mod.rs index bf855f8..04828b1 100644 --- a/src/filters/mod.rs +++ b/src/filters/mod.rs @@ -11,7 +11,7 @@ pub(crate) use self::empty::EmptyFilter; pub use self::init::initialize; pub use self::lines::LinesFilter; pub use self::regex::RegexFilter; -pub use self::similarity::SimilarityFilter; +pub use self::similarity::{SimilarityFilter, HashValueType}; pub use self::size::SizeFilter; pub use self::status_code::StatusCodeFilter; pub(crate) use self::utils::{create_similarity_filter, filter_lookup}; diff --git a/src/filters/similarity.rs b/src/filters/similarity.rs index f18068d..2879980 100644 --- a/src/filters/similarity.rs +++ b/src/filters/similarity.rs @@ -1,12 +1,33 @@ +use std::hash::BuildHasherDefault; + use super::*; +use crate::MIN_SSDEEP_SIZE; use fuzzyhash::FuzzyHash; +use gaoya::minhash::{MinHash, MinHasher, MinHasher16}; +use gaoya::text::whitespace_split; + +/// enum wrapper for two distinct hashing signature types +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) enum HashValueType { + /// String value for FuzzyHash + String(String), + + /// Vec value for minhash + Vec(Vec), +} + +impl Default for HashValueType { + fn default() -> Self { + Self::String(String::new()) + } +} /// Simple implementor of FeroxFilter; used to filter out responses based on the similarity of a /// Response body with a known response; specified using --filter-similar-to #[derive(Default, Debug, PartialEq, Eq, Serialize, 
Deserialize)] pub struct SimilarityFilter { /// Hash of Response's body to be used during similarity comparison - pub hash: String, + pub hash: HashValueType, /// Percentage of similarity at which a page is determined to be a near-duplicate of another pub threshold: u32, @@ -20,10 +41,22 @@ impl FeroxFilter for SimilarityFilter { /// Check `FeroxResponse::text` against what was requested from the site passed in via /// --filter-similar-to fn should_filter_response(&self, response: &FeroxResponse) -> bool { - let other = FuzzyHash::new(response.text()); + match self.hash { + HashValueType::String(ref hash) => { + // original response size was over the minimum required to effectively use ssdeep + let other = FuzzyHash::new(response.text()); - if let Ok(result) = FuzzyHash::compare(&self.hash, other.to_string()) { - return result >= self.threshold; + if let Ok(result) = FuzzyHash::compare(hash, other.to_string()) { + return result >= self.threshold; + } + } + HashValueType::Vec(ref hash) => { + // original response was too small for ssdeep, so minhash was used as an alternative + let hasher = MinHasher16::new(256); + let other = hasher.create_signature(whitespace_split(response.text())); + let result = hasher.compute_similarity(hash.iter(), other.iter()); + return (result * 100.0) as u32 >= self.threshold; + } } // couldn't hash the response, don't filter diff --git a/src/filters/tests.rs b/src/filters/tests.rs index 98b7a25..b48c71b 100644 --- a/src/filters/tests.rs +++ b/src/filters/tests.rs @@ -1,6 +1,7 @@ use super::*; use ::fuzzyhash::FuzzyHash; use ::regex::Regex; +use super::similarity::HashValueType; #[test] /// simply test the default values for wildcardfilter, expect 0, 0 @@ -186,7 +187,7 @@ fn similarity_filter_is_accurate() { resp.set_text("sitting"); let mut filter = SimilarityFilter { - hash: FuzzyHash::new("kitten").to_string(), + hash: HashValueType::String(FuzzyHash::new("kitten").to_string()), threshold: 95, original_url: "".to_string(), }; @@ 
-195,14 +196,14 @@ fn similarity_filter_is_accurate() { assert!(!filter.should_filter_response(&resp)); resp.set_text(""); - filter.hash = String::new(); + filter.hash = HashValueType::String(String::new()); filter.threshold = 100; // two empty strings are the same, however ssdeep doesn't accept empty strings, expect false assert!(!filter.should_filter_response(&resp)); resp.set_text("some data to hash for the purposes of running a test"); - filter.hash = FuzzyHash::new("some data to hash for the purposes of running a te").to_string(); + filter.hash = HashValueType::String(FuzzyHash::new("some data to hash for the purposes of running a te").to_string()); filter.threshold = 17; assert!(filter.should_filter_response(&resp)); @@ -212,20 +213,19 @@ fn similarity_filter_is_accurate() { /// just a simple test to increase code coverage by hitting as_any and the inner value fn similarity_filter_as_any() { let filter = SimilarityFilter { - hash: String::from("stuff"), + hash: HashValueType::String(String::from("stuff")), threshold: 95, original_url: "".to_string(), }; let filter2 = SimilarityFilter { - hash: String::from("stuff"), + hash: HashValueType::String(String::from("stuff")), threshold: 95, original_url: "".to_string(), }; assert!(filter.box_eq(filter2.as_any())); - assert_eq!(filter.hash, "stuff"); assert_eq!( *filter.as_any().downcast_ref::<SimilarityFilter>().unwrap(), filter diff --git a/src/filters/utils.rs b/src/filters/utils.rs index e4ae318..bae4251 100644 --- a/src/filters/utils.rs +++ b/src/filters/utils.rs @@ -1,11 +1,14 @@ +use super::similarity::HashValueType; use super::FeroxFilter; use super::SimilarityFilter; use crate::event_handlers::Handles; use crate::response::FeroxResponse; use crate::utils::logged_request; -use crate::{DEFAULT_METHOD, SIMILARITY_THRESHOLD}; +use crate::{DEFAULT_METHOD, MIN_SSDEEP_SIZE, SIMILARITY_THRESHOLD}; use anyhow::Result; use fuzzyhash::FuzzyHash; +use gaoya::minhash::{MinHasher, MinHasher16}; +use gaoya::text::whitespace_split; use 
regex::Regex; use reqwest::Url; use std::sync::Arc; @@ -41,7 +44,14 @@ pub(crate) async fn create_similarity_filter( } // hash the response body and store the resulting hash in the filter object - let hash = FuzzyHash::new(fr.text()).to_string(); + let hash = if fr.content_length() <= MIN_SSDEEP_SIZE { + // response too small for ssdeep + let hasher = MinHasher16::new(256); + HashValueType::Vec(hasher.create_signature(whitespace_split(fr.text()))) + } else { + // size over ssdeep's minimum value + HashValueType::String(FuzzyHash::new(fr.text()).to_string()) + }; Ok(SimilarityFilter { hash, @@ -95,7 +105,7 @@ pub(crate) fn filter_lookup(filter_type: &str, filter_value: &str) -> Option { return Some(Box::new(SimilarityFilter { - hash: String::new(), + hash: HashValueType::String(String::new()), threshold: SIMILARITY_THRESHOLD, original_url: filter_value.to_string(), })); @@ -157,7 +167,7 @@ mod tests { assert_eq!( filter.as_any().downcast_ref::<SimilarityFilter>().unwrap(), &SimilarityFilter { - hash: String::new(), + hash: HashValueType::String(String::new()), threshold: SIMILARITY_THRESHOLD, original_url: "http://localhost".to_string() } @@ -195,7 +205,7 @@ mod tests { assert_eq!( filter, SimilarityFilter { - hash: "3:YKEpn:Yfp".to_string(), + hash: HashValueType::String("3:YKEpn:Yfp".to_string()), threshold: SIMILARITY_THRESHOLD, original_url: srv.url("/") } diff --git a/src/heuristics.rs b/src/heuristics.rs index 6c32c84..36d7e00 100644 --- a/src/heuristics.rs +++ b/src/heuristics.rs @@ -5,7 +5,6 @@ use console::style; use scraper::{Html, Selector}; use uuid::Uuid; -use crate::filters::SimilarityFilter; use crate::message::FeroxMessage; use crate::{ config::OutputLevel, diff --git a/src/lib.rs b/src/lib.rs index 8b8a90a..47d0c8d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,6 +55,11 @@ pub const DEFAULT_OPEN_FILE_LIMIT: u64 = 8192; /// Default value used to determine near-duplicate web pages (equivalent to 95%) pub const SIMILARITY_THRESHOLD: u32 = 95; +/// Minimum size of 
response body for ssdeep to produce meaningful results +/// +/// ref: https://github.com/glaslos/ssdeep/issues/17 +pub(crate) const MIN_SSDEEP_SIZE: u64 = 4096; + /// Default set of extensions to Ignore when auto-collecting extensions during scans pub(crate) const DEFAULT_IGNORED_EXTENSIONS: [&str; 38] = [ "tif", "tiff", "ico", "cur", "bmp", "webp", "svg", "png", "jpg", "jpeg", "jfif", "gif", "avif", diff --git a/src/scan_manager/tests.rs b/src/scan_manager/tests.rs index c83cb63..ea399a8 100644 --- a/src/scan_manager/tests.rs +++ b/src/scan_manager/tests.rs @@ -1,7 +1,7 @@ use super::*; use crate::filters::{ FeroxFilters, LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter, - WordsFilter, + WordsFilter,HashValueType }; use crate::{ config::{Configuration, OutputLevel}, @@ -399,7 +399,7 @@ fn feroxstates_feroxserialize_implementation() { .unwrap(); filters .push(Box::new(SimilarityFilter { - hash: "3:YKEpn:Yfp".to_string(), + hash: HashValueType::String("3:YKEpn:Yfp".to_string()), threshold: SIMILARITY_THRESHOLD, original_url: "http://localhost:12345/".to_string(), }))