fixed invalid uri exception during extraction

2026-06-09 12:11:16 -03:00 · 2022-11-16 07:09:02 -06:00
parent c9c63bebd0
commit ce7f3b79b8
3 changed files with 27 additions and 3 deletions
--- a/src/extractor/builder.rs
+++ b/src/extractor/builder.rs
@@ -13,6 +13,12 @@ pub(super) const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^
 pub(super) const ROBOTS_TXT_REGEX: &str =
    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)

+/// Regular expression to filter bad characters from extracted url paths
+///
+/// ref: https://www.rfc-editor.org/rfc/rfc3986#section-2
+// pub(super) const URL_CHARS_REGEX: &str = r#""<>\\^`\{|\} \t\r\n\x0b\x0c"#;
+pub(super) const URL_CHARS_REGEX: &str = r#"["<>\\^`{|} ]"#;
+
 /// Which type of extraction should be performed
 #[derive(Debug, Copy, Clone)]
 pub enum ExtractionTarget {
@@ -90,6 +96,7 @@ impl<'a> ExtractorBuilder<'a> {
        Ok(Extractor {
            links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
            robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
+            url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
            response: if self.response.is_some() {
                Some(self.response.unwrap())
            } else {
--- a/src/extractor/container.rs
+++ b/src/extractor/container.rs
@@ -17,7 +17,7 @@ use crate::{
 use anyhow::{bail, Context, Result};
 use reqwest::{Client, StatusCode, Url};
 use scraper::{Html, Selector};
-use std::collections::HashSet;
+use std::{borrow::Cow, collections::HashSet};

 /// Whether an active scan is recursive or not
 #[derive(Debug)]
@@ -38,6 +38,9 @@ pub struct Extractor<'a> {
    /// `ROBOTS_TXT_REGEX` as a regex::Regex type
    pub(super) robots_regex: Regex,

+    /// `URL_CHARS_REGEX` as a regex::Regex type; strips bad characters from extracted url paths
+    pub(super) url_regex: Regex,
+
    /// Response from which to extract links
    pub(super) response: Option<&'a FeroxResponse>,

@@ -332,8 +335,9 @@ impl<'a> Extractor<'a> {
        let normalized_path = self.normalize_url_path(path);

        // filter out any empty strings caused by .split
-        let mut parts: Vec<&str> = normalized_path
+        let mut parts: Vec<Cow<_>> = normalized_path
            .split('/')
+            .map(|s| self.url_regex.replace_all(s, ""))
            .filter(|s| !s.is_empty())
            .collect();

@@ -392,6 +396,17 @@ impl<'a> Extractor<'a> {
            .join(link)
            .with_context(|| format!("Could not join {} with {}", old_url, link))?;

+        if old_url.domain() != new_url.domain() || old_url.host() != new_url.host() {
+            // domains/ips are not the same, don't scan things that aren't part of the original
+            // target url (host() is compared too, since domain() is None for ip addresses)
+            log::debug!(
+                "Skipping {} because it's not part of the original target",
+                new_url
+            );
+            log::trace!("exit: add_link_to_set_of_links");
+            return Ok(());
+        }
+
        links.insert(new_url.to_string());

        log::trace!("exit: add_link_to_set_of_links");
--- a/src/extractor/tests.rs
+++ b/src/extractor/tests.rs
@@ -1,4 +1,4 @@
-use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX};
+use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX, URL_CHARS_REGEX};
 use super::*;
 use crate::config::{Configuration, OutputLevel};
 use crate::scan_manager::ScanOrder;
@@ -273,6 +273,7 @@ async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain()
    let extractor = Extractor {
        links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
        robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
+        url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
        response: Some(&ferox_response),
        url: String::new(),
        target: ExtractionTarget::ResponseBody,
@@ -301,6 +302,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> {
    let extractor = Extractor {
        links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
        robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
+        url_regex: Regex::new(URL_CHARS_REGEX).unwrap(),
        response: None,
        url: srv.url("/api/users/stuff/things"),
        target: ExtractionTarget::RobotsTxt,