mirror of
https://github.com/epi052/feroxbuster.git
synced 2026-06-09 12:11:16 -03:00
fixed invalid uri exception during extraction
This commit is contained in:
@@ -13,6 +13,12 @@ pub(super) const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^
|
||||
pub(super) const ROBOTS_TXT_REGEX: &str =
|
||||
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)
|
||||
|
||||
/// Regular expression matching characters that are removed from extracted url paths
/// (any single `"`, `<`, `>`, `\`, `^`, backtick, `{`, `|`, `}`, or space)
|
||||
///
|
||||
/// ref: https://www.rfc-editor.org/rfc/rfc3986#section-2
|
||||
// NOTE(review): commented-out alternative pattern that also lists \t \r \n \x0b \x0c; the
// active pattern below does not match those whitespace/control characters
// pub(super) const URL_CHARS_REGEX: &str = r#""<>\\^`\{|\} \t\r\n\x0b\x0c"#;
|
||||
pub(super) const URL_CHARS_REGEX: &str = r#"["<>\\^`{|} ]"#;
|
||||
|
||||
/// Which type of extraction should be performed
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub enum ExtractionTarget {
|
||||
@@ -90,6 +96,7 @@ impl<'a> ExtractorBuilder<'a> {
|
||||
Ok(Extractor {
|
||||
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
|
||||
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
|
||||
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
|
||||
response: if self.response.is_some() {
|
||||
Some(self.response.unwrap())
|
||||
} else {
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::{
|
||||
use anyhow::{bail, Context, Result};
|
||||
use reqwest::{Client, StatusCode, Url};
|
||||
use scraper::{Html, Selector};
|
||||
use std::collections::HashSet;
|
||||
use std::{borrow::Cow, collections::HashSet};
|
||||
|
||||
/// Whether an active scan is recursive or not
|
||||
#[derive(Debug)]
|
||||
@@ -38,6 +38,9 @@ pub struct Extractor<'a> {
|
||||
/// `ROBOTS_TXT_REGEX` as a regex::Regex type
|
||||
pub(super) robots_regex: Regex,
|
||||
|
||||
/// `URL_CHARS_REGEX` as a regex::Regex type; matches characters stripped from extracted url paths
|
||||
pub(super) url_regex: Regex,
|
||||
|
||||
/// Response from which to extract links
|
||||
pub(super) response: Option<&'a FeroxResponse>,
|
||||
|
||||
@@ -332,8 +335,9 @@ impl<'a> Extractor<'a> {
|
||||
let normalized_path = self.normalize_url_path(path);
|
||||
|
||||
// strip characters matched by url_regex from each segment, then filter out any empty
// strings caused by .split (or left behind by the stripping)
|
||||
let mut parts: Vec<&str> = normalized_path
|
||||
let mut parts: Vec<Cow<_>> = normalized_path
|
||||
.split('/')
|
||||
.map(|s| self.url_regex.replace_all(s, ""))
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
|
||||
@@ -392,6 +396,17 @@ impl<'a> Extractor<'a> {
|
||||
.join(link)
|
||||
.with_context(|| format!("Could not join {} with {}", old_url, link))?;
|
||||
|
||||
if old_url.domain() != new_url.domain() || old_url.host() != new_url.host() {
|
||||
// domains/ips are not the same, don't scan things that aren't part of the original
|
||||
// target url; the host check compares old_url against new_url so differing ips are caught
|
||||
log::debug!(
|
||||
"Skipping {} because it's not part of the original target",
|
||||
new_url
|
||||
);
|
||||
log::trace!("exit: add_link_to_set_of_links");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
links.insert(new_url.to_string());
|
||||
|
||||
log::trace!("exit: add_link_to_set_of_links");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX};
|
||||
use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX, URL_CHARS_REGEX};
|
||||
use super::*;
|
||||
use crate::config::{Configuration, OutputLevel};
|
||||
use crate::scan_manager::ScanOrder;
|
||||
@@ -273,6 +273,7 @@ async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain()
|
||||
let extractor = Extractor {
|
||||
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
|
||||
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
|
||||
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
|
||||
response: Some(&ferox_response),
|
||||
url: String::new(),
|
||||
target: ExtractionTarget::ResponseBody,
|
||||
@@ -301,6 +302,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> {
|
||||
let extractor = Extractor {
|
||||
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
|
||||
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
|
||||
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
|
||||
response: None,
|
||||
url: srv.url("/api/users/stuff/things"),
|
||||
target: ExtractionTarget::RobotsTxt,
|
||||
|
||||
Reference in New Issue
Block a user