fixed invalid uri exception during extraction

This commit is contained in:
epi
2022-11-16 07:09:02 -06:00
parent c9c63bebd0
commit ce7f3b79b8
3 changed files with 27 additions and 3 deletions

View File

@@ -13,6 +13,12 @@ pub(super) const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^
pub(super) const ROBOTS_TXT_REGEX: &str =
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)
/// Regular expression to filter bad characters from extracted url paths
///
/// ref: https://www.rfc-editor.org/rfc/rfc3986#section-2
// pub(super) const URL_CHARS_REGEX: &str = r#""<>\\^`\{|\} \t\r\n\x0b\x0c"#;
pub(super) const URL_CHARS_REGEX: &str = r#"["<>\\^`{|} ]"#;
/// Which type of extraction should be performed
#[derive(Debug, Copy, Clone)]
pub enum ExtractionTarget {
@@ -90,6 +96,7 @@ impl<'a> ExtractorBuilder<'a> {
Ok(Extractor {
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
response: if self.response.is_some() {
Some(self.response.unwrap())
} else {

View File

@@ -17,7 +17,7 @@ use crate::{
use anyhow::{bail, Context, Result};
use reqwest::{Client, StatusCode, Url};
use scraper::{Html, Selector};
use std::collections::HashSet;
use std::{borrow::Cow, collections::HashSet};
/// Whether an active scan is recursive or not
#[derive(Debug)]
@@ -38,6 +38,9 @@ pub struct Extractor<'a> {
/// `ROBOTS_TXT_REGEX` as a regex::Regex type
pub(super) robots_regex: Regex,
/// regex to validate a url
pub(super) url_regex: Regex,
/// Response from which to extract links
pub(super) response: Option<&'a FeroxResponse>,
@@ -332,8 +335,9 @@ impl<'a> Extractor<'a> {
let normalized_path = self.normalize_url_path(path);
// filter out any empty strings caused by .split
let mut parts: Vec<&str> = normalized_path
let mut parts: Vec<Cow<_>> = normalized_path
.split('/')
.map(|s| self.url_regex.replace_all(s, ""))
.filter(|s| !s.is_empty())
.collect();
@@ -392,6 +396,17 @@ impl<'a> Extractor<'a> {
.join(link)
.with_context(|| format!("Could not join {} with {}", old_url, link))?;
if old_url.domain() != new_url.domain() || old_url.host() != old_url.host() {
// domains/ips are not the same, don't scan things that aren't part of the original
// target url
log::debug!(
"Skipping {} because it's not part of the original target",
new_url
);
log::trace!("exit: add_link_to_set_of_links");
return Ok(());
}
links.insert(new_url.to_string());
log::trace!("exit: add_link_to_set_of_links");

View File

@@ -1,4 +1,4 @@
use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX};
use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX, URL_CHARS_REGEX};
use super::*;
use crate::config::{Configuration, OutputLevel};
use crate::scan_manager::ScanOrder;
@@ -273,6 +273,7 @@ async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain()
let extractor = Extractor {
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
response: Some(&ferox_response),
url: String::new(),
target: ExtractionTarget::ResponseBody,
@@ -301,6 +302,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> {
let extractor = Extractor {
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
response: None,
url: srv.url("/api/users/stuff/things"),
target: ExtractionTarget::RobotsTxt,