Added minhash algorithm for responses too short for ssdeep

This commit is contained in:
epi
2023-02-25 06:20:30 -06:00
parent 3531b8c74b
commit 96375e7734
7 changed files with 66 additions and 19 deletions

View File

@@ -11,7 +11,7 @@ pub(crate) use self::empty::EmptyFilter;
pub use self::init::initialize;
pub use self::lines::LinesFilter;
pub use self::regex::RegexFilter;
pub use self::similarity::SimilarityFilter;
pub use self::similarity::{SimilarityFilter, HashValueType};
pub use self::size::SizeFilter;
pub use self::status_code::StatusCodeFilter;
pub(crate) use self::utils::{create_similarity_filter, filter_lookup};

View File

@@ -1,12 +1,33 @@
use std::hash::BuildHasherDefault;
use super::*;
use crate::MIN_SSDEEP_SIZE;
use fuzzyhash::FuzzyHash;
use gaoya::minhash::{MinHash, MinHasher, MinHasher16};
use gaoya::text::whitespace_split;
/// Wrapper for the two distinct hash signature types a `SimilarityFilter` can hold
///
/// Declared `pub` because it is re-exported with `pub use` from the parent module and is the
/// type of the public `SimilarityFilter::hash` field; a crate-restricted type cannot be
/// re-exported publicly.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum HashValueType {
    /// `String` value for FuzzyHash
    String(String),

    /// `Vec<u16>` value for minhash
    Vec(Vec<u16>),
}
impl Default for HashValueType {
fn default() -> Self {
Self::String(String::new())
}
}
/// Simple implementor of FeroxFilter; used to filter out responses based on the similarity of a
/// Response body with a known response; specified using --filter-similar-to
#[derive(Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct SimilarityFilter {
/// Hash of Response's body to be used during similarity comparison
pub hash: String,
pub hash: HashValueType,
/// Percentage of similarity at which a page is determined to be a near-duplicate of another
pub threshold: u32,
@@ -20,10 +41,22 @@ impl FeroxFilter for SimilarityFilter {
/// Check `FeroxResponse::text` against what was requested from the site passed in via
/// --filter-similar-to
fn should_filter_response(&self, response: &FeroxResponse) -> bool {
let other = FuzzyHash::new(response.text());
match self.hash {
HashValueType::String(ref hash) => {
// original response size was over the minimum required to effectively use ssdeep
let other = FuzzyHash::new(response.text());
if let Ok(result) = FuzzyHash::compare(&self.hash, other.to_string()) {
return result >= self.threshold;
if let Ok(result) = FuzzyHash::compare(hash, other.to_string()) {
return result >= self.threshold;
}
}
HashValueType::Vec(ref hash) => {
// original response was too small for ssdeep, so minhash was used as an alternative
let hasher = MinHasher16::new(256);
let other = hasher.create_signature(whitespace_split(response.text()));
let result = hasher.compute_similarity(hash.iter(), other.iter());
return (result * 100.0) as u32 >= self.threshold;
}
}
// couldn't hash the response, don't filter

View File

@@ -1,6 +1,7 @@
use super::*;
use ::fuzzyhash::FuzzyHash;
use ::regex::Regex;
use super::similarity::HashValueType;
#[test]
/// simply test the default values for wildcardfilter, expect 0, 0
@@ -186,7 +187,7 @@ fn similarity_filter_is_accurate() {
resp.set_text("sitting");
let mut filter = SimilarityFilter {
hash: FuzzyHash::new("kitten").to_string(),
hash: HashValueType::String(FuzzyHash::new("kitten").to_string()),
threshold: 95,
original_url: "".to_string(),
};
@@ -195,14 +196,14 @@ fn similarity_filter_is_accurate() {
assert!(!filter.should_filter_response(&resp));
resp.set_text("");
filter.hash = String::new();
filter.hash = HashValueType::String(String::new());
filter.threshold = 100;
// two empty strings are the same, however ssdeep doesn't accept empty strings, expect false
assert!(!filter.should_filter_response(&resp));
resp.set_text("some data to hash for the purposes of running a test");
filter.hash = FuzzyHash::new("some data to hash for the purposes of running a te").to_string();
filter.hash = HashValueType::String(FuzzyHash::new("some data to hash for the purposes of running a te").to_string());
filter.threshold = 17;
assert!(filter.should_filter_response(&resp));
@@ -212,20 +213,19 @@ fn similarity_filter_is_accurate() {
/// just a simple test to increase code coverage by hitting as_any and the inner value
fn similarity_filter_as_any() {
let filter = SimilarityFilter {
hash: String::from("stuff"),
hash: HashValueType::String(String::from("stuff")),
threshold: 95,
original_url: "".to_string(),
};
let filter2 = SimilarityFilter {
hash: String::from("stuff"),
hash: HashValueType::String(String::from("stuff")),
threshold: 95,
original_url: "".to_string(),
};
assert!(filter.box_eq(filter2.as_any()));
assert_eq!(filter.hash, "stuff");
assert_eq!(
*filter.as_any().downcast_ref::<SimilarityFilter>().unwrap(),
filter

View File

@@ -1,11 +1,14 @@
use super::similarity::HashValueType;
use super::FeroxFilter;
use super::SimilarityFilter;
use crate::event_handlers::Handles;
use crate::response::FeroxResponse;
use crate::utils::logged_request;
use crate::{DEFAULT_METHOD, SIMILARITY_THRESHOLD};
use crate::{DEFAULT_METHOD, MIN_SSDEEP_SIZE, SIMILARITY_THRESHOLD};
use anyhow::Result;
use fuzzyhash::FuzzyHash;
use gaoya::minhash::{MinHasher, MinHasher16};
use gaoya::text::whitespace_split;
use regex::Regex;
use reqwest::Url;
use std::sync::Arc;
@@ -41,7 +44,14 @@ pub(crate) async fn create_similarity_filter(
}
// hash the response body and store the resulting hash in the filter object
let hash = FuzzyHash::new(fr.text()).to_string();
let hash = if fr.content_length() <= MIN_SSDEEP_SIZE {
// response too small for ssdeep
let hasher = MinHasher16::new(256);
HashValueType::Vec(hasher.create_signature(whitespace_split(fr.text())))
} else {
// size over ssdeep's minimum value
HashValueType::String(FuzzyHash::new(fr.text()).to_string())
};
Ok(SimilarityFilter {
hash,
@@ -95,7 +105,7 @@ pub(crate) fn filter_lookup(filter_type: &str, filter_value: &str) -> Option<Box
}
"similarity" => {
return Some(Box::new(SimilarityFilter {
hash: String::new(),
hash: HashValueType::String(String::new()),
threshold: SIMILARITY_THRESHOLD,
original_url: filter_value.to_string(),
}));
@@ -157,7 +167,7 @@ mod tests {
assert_eq!(
filter.as_any().downcast_ref::<SimilarityFilter>().unwrap(),
&SimilarityFilter {
hash: String::new(),
hash: HashValueType::String(String::new()),
threshold: SIMILARITY_THRESHOLD,
original_url: "http://localhost".to_string()
}
@@ -195,7 +205,7 @@ mod tests {
assert_eq!(
filter,
SimilarityFilter {
hash: "3:YKEpn:Yfp".to_string(),
hash: HashValueType::String("3:YKEpn:Yfp".to_string()),
threshold: SIMILARITY_THRESHOLD,
original_url: srv.url("/")
}

View File

@@ -5,7 +5,6 @@ use console::style;
use scraper::{Html, Selector};
use uuid::Uuid;
use crate::filters::SimilarityFilter;
use crate::message::FeroxMessage;
use crate::{
config::OutputLevel,

View File

@@ -55,6 +55,11 @@ pub const DEFAULT_OPEN_FILE_LIMIT: u64 = 8192;
/// Default similarity percentage used to determine near-duplicate web pages (equivalent to 95%)
pub const SIMILARITY_THRESHOLD: u32 = 95;
/// Smallest response body size at which ssdeep can be expected to produce meaningful results
///
/// ref: https://github.com/glaslos/ssdeep/issues/17
pub(crate) const MIN_SSDEEP_SIZE: u64 = 4 * 1024;
/// Default set of extensions to Ignore when auto-collecting extensions during scans
pub(crate) const DEFAULT_IGNORED_EXTENSIONS: [&str; 38] = [
"tif", "tiff", "ico", "cur", "bmp", "webp", "svg", "png", "jpg", "jpeg", "jfif", "gif", "avif",

View File

@@ -1,7 +1,7 @@
use super::*;
use crate::filters::{
FeroxFilters, LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter,
WordsFilter,
WordsFilter,HashValueType
};
use crate::{
config::{Configuration, OutputLevel},
@@ -399,7 +399,7 @@ fn feroxstates_feroxserialize_implementation() {
.unwrap();
filters
.push(Box::new(SimilarityFilter {
hash: "3:YKEpn:Yfp".to_string(),
hash: HashValueType::String("3:YKEpn:Yfp".to_string()),
threshold: SIMILARITY_THRESHOLD,
original_url: "http://localhost:12345/".to_string(),
}))