extractor restructure mostly done

2026-05-27 16:51:13 -03:00 · 2021-01-16 08:07:38 -06:00
parent 4b2af18ae2
commit 269ae86201
11 changed files with 1050 additions and 630 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,504 +0,0 @@
-use crate::{
-    client,
-    config::{Configuration, CONFIGURATION},
-    scanner::SCANNED_URLS,
-    statistics::{
-        StatCommand::{self, UpdateUsizeField},
-        StatField::{LinksExtracted, TotalExpected},
-    },
-    utils::{format_url, make_request},
-    FeroxResponse,
-};
-use lazy_static::lazy_static;
-use regex::Regex;
-use reqwest::Url;
-use std::collections::HashSet;
-use tokio::sync::mpsc::UnboundedSender;
-
-/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
-///
-/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
-const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;
-
-/// Regular expression to pull url paths from robots.txt
-///
-/// ref: https://developers.google.com/search/reference/robots_txt
-const ROBOTS_TXT_REGEX: &str =
-    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)
-
-lazy_static! {
-    /// `LINKFINDER_REGEX` as a regex::Regex type
-    static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
-
-    /// `ROBOTS_TXT_REGEX` as a regex::Regex type
-    static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
-}
-
-/// Iterate over a given path, return a list of every sub-path found
-///
-/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
-/// the following fragments would be returned:
-///   - homepage/assets/img/icons/handshake.svg
-///   - homepage/assets/img/icons/
-///   - homepage/assets/img/
-///   - homepage/assets/
-///   - homepage/
-fn get_sub_paths_from_path(path: &str) -> Vec<String> {
-    log::trace!("enter: get_sub_paths_from_path({})", path);
-    let mut paths = vec![];
-
-    // filter out any empty strings caused by .split
-    let mut parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
-
-    let length = parts.len();
-
-    for i in 0..length {
-        // iterate over all parts of the path
-        if parts.is_empty() {
-            // pop left us with an empty vector, we're done
-            break;
-        }
-
-        let mut possible_path = parts.join("/");
-
-        if possible_path.is_empty() {
-            // .join can result in an empty string, which we don't need, ignore
-            continue;
-        }
-
-        if i > 0 {
-            // this isn't the last index of the parts array
-            // ex: /buried/misc/stupidfile.php
-            // this block skips the file but sees all parent folders
-            possible_path = format!("{}/", possible_path);
-        }
-
-        paths.push(possible_path); // good sub-path found
-        parts.pop(); // use .pop() to remove the last part of the path and continue iteration
-    }
-
-    log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
-    paths
-}
-
-/// simple helper to stay DRY, trys to join a url + fragment and add it to the `links` HashSet
-fn add_link_to_set_of_links(link: &str, url: &Url, links: &mut HashSet<String>) {
-    log::trace!(
-        "enter: add_link_to_set_of_links({}, {}, {:?})",
-        link,
-        url.to_string(),
-        links
-    );
-    match url.join(&link) {
-        Ok(new_url) => {
-            links.insert(new_url.to_string());
-        }
-        Err(e) => {
-            log::error!("Could not join given url to the base url: {}", e);
-        }
-    }
-    log::trace!("exit: add_link_to_set_of_links");
-}
-
-/// Given a `reqwest::Response`, perform the following actions
-///   - parse the response's text for links using the linkfinder regex
-///   - for every link found take its url path and parse each sub-path
-///     - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
-///       with a base url of http://localhost, the following urls would be returned:
-///         - homepage/assets/img/icons/handshake.svg
-///         - homepage/assets/img/icons/
-///         - homepage/assets/img/
-///         - homepage/assets/
-///         - homepage/
-pub async fn get_links(
-    response: &FeroxResponse,
-    tx_stats: UnboundedSender<StatCommand>,
-) -> HashSet<String> {
-    log::trace!(
-        "enter: get_links({}, {:?})",
-        response.url().as_str(),
-        tx_stats
-    );
-
-    let mut links = HashSet::<String>::new();
-
-    let body = response.text();
-
-    for capture in LINKS_REGEX.captures_iter(&body) {
-        // remove single & double quotes from both ends of the capture
-        // capture[0] is the entire match, additional capture groups start at [1]
-        let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
-
-        match Url::parse(link) {
-            Ok(absolute) => {
-                if absolute.domain() != response.url().domain()
-                    || absolute.host() != response.url().host()
-                {
-                    // domains/ips are not the same, don't scan things that aren't part of the original
-                    // target url
-                    continue;
-                }
-
-                add_all_sub_paths(absolute.path(), &response, &mut links);
-            }
-            Err(e) => {
-                // this is the expected error that happens when we try to parse a url fragment
-                //     ex: Url::parse("/login") -> Err("relative URL without a base")
-                // while this is technically an error, these are good results for us
-                if e.to_string().contains("relative URL without a base") {
-                    add_all_sub_paths(link, &response, &mut links);
-                } else {
-                    // unexpected error has occurred
-                    log::error!("Could not parse given url: {}", e);
-                }
-            }
-        }
-    }
-
-    let multiplier = CONFIGURATION.extensions.len().max(1);
-
-    update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
-    update_stat!(
-        tx_stats,
-        UpdateUsizeField(TotalExpected, links.len() * multiplier)
-    );
-
-    log::trace!("exit: get_links -> {:?}", links);
-
-    links
-}
-
-/// take a url fragment like homepage/assets/img/icons/handshake.svg and
-/// incrementally add
-///     - homepage/assets/img/icons/
-///     - homepage/assets/img/
-///     - homepage/assets/
-///     - homepage/
-fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, mut links: &mut HashSet<String>) {
-    log::trace!(
-        "enter: add_all_sub_paths({}, {}, {:?})",
-        url_path,
-        response,
-        links
-    );
-
-    for sub_path in get_sub_paths_from_path(url_path) {
-        log::debug!("Adding {} to {:?}", sub_path, links);
-        add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
-    }
-
-    log::trace!("exit: add_all_sub_paths");
-}
-
-/// Wrapper around link extraction logic
-/// currently used in two places:
-///   - links from response bodys
-///   - links from robots.txt responses
-///
-/// general steps taken:
-///   - create a new Url object based on cli options/args
-///   - check if the new Url has already been seen/scanned -> None
-///   - make a request to the new Url ? -> Some(response) : None
-pub async fn request_feroxresponse_from_new_link(
-    url: &str,
-    tx_stats: UnboundedSender<StatCommand>,
-) -> Option<FeroxResponse> {
-    log::trace!(
-        "enter: request_feroxresponse_from_new_link({}, {:?})",
-        url,
-        tx_stats
-    );
-
-    // create a url based on the given command line options, return None on error
-    let new_url = match format_url(
-        &url,
-        &"",
-        CONFIGURATION.add_slash,
-        &CONFIGURATION.queries,
-        None,
-        tx_stats.clone(),
-    ) {
-        Ok(url) => url,
-        Err(_) => {
-            log::trace!("exit: request_feroxresponse_from_new_link -> None");
-            return None;
-        }
-    };
-
-    if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
-        //we've seen the url before and don't need to scan again
-        log::trace!("exit: request_feroxresponse_from_new_link -> None");
-        return None;
-    }
-
-    // make the request and store the response
-    let new_response = match make_request(&CONFIGURATION.client, &new_url, tx_stats).await {
-        Ok(resp) => resp,
-        Err(_) => {
-            log::trace!("exit: request_feroxresponse_from_new_link -> None");
-            return None;
-        }
-    };
-
-    let new_ferox_response = FeroxResponse::from(new_response, true).await;
-
-    log::trace!(
-        "exit: request_feroxresponse_from_new_link -> {:?}",
-        new_ferox_response
-    );
-    Some(new_ferox_response)
-}
-
-/// helper function that simply requests /robots.txt on the given url's base url
-///
-/// example:
-///     http://localhost/api/users -> http://localhost/robots.txt
-///     
-/// The length of the given path has no effect on what's requested; it's always
-/// base url + /robots.txt
-pub async fn request_robots_txt(
-    base_url: &str,
-    config: &Configuration,
-    tx_stats: UnboundedSender<StatCommand>,
-) -> Option<FeroxResponse> {
-    log::trace!(
-        "enter: get_robots_file({}, CONFIGURATION, {:?})",
-        base_url,
-        tx_stats
-    );
-
-    // more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something
-    // similar; to account for that, create a client that will follow redirects, regardless of
-    // what the user specified for the scanning client. Other than redirects, it will respect
-    // all other user specified settings
-    let follow_redirects = true;
-
-    let proxy = if config.proxy.is_empty() {
-        None
-    } else {
-        Some(config.proxy.as_str())
-    };
-
-    let client = client::initialize(
-        config.timeout,
-        &config.user_agent,
-        follow_redirects,
-        config.insecure,
-        &config.headers,
-        proxy,
-    );
-
-    if let Ok(mut url) = Url::parse(base_url) {
-        url.set_path("/robots.txt"); // overwrite existing path with /robots.txt
-
-        if let Ok(response) = make_request(&client, &url, tx_stats).await {
-            let ferox_response = FeroxResponse::from(response, true).await;
-
-            log::trace!("exit: get_robots_file -> {}", ferox_response);
-            return Some(ferox_response);
-        }
-    }
-
-    None
-}
-
-/// Entry point to perform link extraction from robots.txt
-///
-/// `base_url` can have paths and subpaths, however robots.txt will be requested from the
-/// root of the url
-/// given the url:
-///     http://localhost/stuff/things
-/// this function requests:
-///     http://localhost/robots.txt
-pub async fn extract_robots_txt(
-    base_url: &str,
-    config: &Configuration,
-    tx_stats: UnboundedSender<StatCommand>,
-) -> HashSet<String> {
-    log::trace!(
-        "enter: extract_robots_txt({}, CONFIGURATION, {:?})",
-        base_url,
-        tx_stats
-    );
-    let mut links = HashSet::new();
-
-    if let Some(response) = request_robots_txt(&base_url, &config, tx_stats.clone()).await {
-        for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) {
-            if let Some(new_path) = capture.name("url_path") {
-                if let Ok(mut new_url) = Url::parse(base_url) {
-                    new_url.set_path(new_path.as_str());
-                    add_all_sub_paths(new_url.path(), &response, &mut links);
-                }
-            }
-        }
-    }
-
-    let multiplier = CONFIGURATION.extensions.len().max(1);
-
-    update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
-    update_stat!(
-        tx_stats,
-        UpdateUsizeField(TotalExpected, links.len() * multiplier)
-    );
-
-    log::trace!("exit: extract_robots_txt -> {:?}", links);
-    links
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::utils::make_request;
-    use crate::FeroxChannel;
-    use httpmock::Method::GET;
-    use httpmock::MockServer;
-    use reqwest::Client;
-    use tokio::sync::mpsc;
-
-    #[test]
-    /// extract sub paths from the given url fragment; expect 4 sub paths and that all are
-    /// in the expected array
-    fn extractor_get_sub_paths_from_path_with_multiple_paths() {
-        let path = "homepage/assets/img/icons/handshake.svg";
-        let paths = get_sub_paths_from_path(&path);
-        let expected = vec![
-            "homepage/",
-            "homepage/assets/",
-            "homepage/assets/img/",
-            "homepage/assets/img/icons/",
-            "homepage/assets/img/icons/handshake.svg",
-        ];
-
-        assert_eq!(paths.len(), expected.len());
-        for expected_path in expected {
-            assert_eq!(paths.contains(&expected_path.to_string()), true);
-        }
-    }
-
-    #[test]
-    /// extract sub paths from the given url fragment; expect 2 sub paths and that all are
-    /// in the expected array. the fragment is wrapped in slashes to ensure no empty strings are
-    /// returned
-    fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
-        let path = "/homepage/assets/";
-        let paths = get_sub_paths_from_path(&path);
-        let expected = vec!["homepage/", "homepage/assets"];
-
-        assert_eq!(paths.len(), expected.len());
-        for expected_path in expected {
-            assert_eq!(paths.contains(&expected_path.to_string()), true);
-        }
-    }
-
-    #[test]
-    /// extract sub paths from the given url fragment; expect 1 sub path, no forward slashes are
-    /// included
-    fn extractor_get_sub_paths_from_path_with_only_a_word() {
-        let path = "homepage";
-        let paths = get_sub_paths_from_path(&path);
-        let expected = vec!["homepage"];
-
-        assert_eq!(paths.len(), expected.len());
-        for expected_path in expected {
-            assert_eq!(paths.contains(&expected_path.to_string()), true);
-        }
-    }
-
-    #[test]
-    /// extract sub paths from the given url fragment; expect 1 sub path, forward slash removed
-    fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
-        let path = "/homepage";
-        let paths = get_sub_paths_from_path(&path);
-        let expected = vec!["homepage"];
-
-        assert_eq!(paths.len(), expected.len());
-        for expected_path in expected {
-            assert_eq!(paths.contains(&expected_path.to_string()), true);
-        }
-    }
-
-    #[test]
-    /// test that a full url and fragment are joined correctly, then added to the given list
-    /// i.e. the happy path
-    fn extractor_add_link_to_set_of_links_happy_path() {
-        let url = Url::parse("https://localhost").unwrap();
-        let mut links = HashSet::<String>::new();
-        let link = "admin";
-
-        assert_eq!(links.len(), 0);
-        add_link_to_set_of_links(link, &url, &mut links);
-
-        assert_eq!(links.len(), 1);
-        assert!(links.contains("https://localhost/admin"));
-    }
-
-    #[test]
-    /// test that an invalid path fragment doesn't add anything to the set of links
-    fn extractor_add_link_to_set_of_links_with_non_base_url() {
-        let url = Url::parse("https://localhost").unwrap();
-        let mut links = HashSet::<String>::new();
-        let link = "\\\\\\\\";
-
-        assert_eq!(links.len(), 0);
-        add_link_to_set_of_links(link, &url, &mut links);
-
-        assert_eq!(links.len(), 0);
-        assert!(links.is_empty());
-    }
-
-    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
-    /// use make_request to generate a Response, and use the Response to test get_links;
-    /// the response will contain an absolute path to a domain that is not part of the scanned
-    /// domain; expect an empty set returned
-    async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain(
-    ) -> Result<(), Box<dyn std::error::Error>> {
-        let srv = MockServer::start();
-
-        let mock = srv.mock(|when, then|{
-            when.method(GET)
-                .path("/some-path");
-            then.status(200)
-                .body("\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"");
-        });
-
-        let client = Client::new();
-        let url = Url::parse(&srv.url("/some-path")).unwrap();
-        let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
-
-        let response = make_request(&client, &url, tx.clone()).await.unwrap();
-
-        let ferox_response = FeroxResponse::from(response, true).await;
-
-        let links = get_links(&ferox_response, tx).await;
-
-        assert!(links.is_empty());
-
-        assert_eq!(mock.hits(), 1);
-        Ok(())
-    }
-
-    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
-    /// test that /robots.txt is correctly requested given a base url (happy path)
-    async fn request_robots_txt_with_and_without_proxy() {
-        let srv = MockServer::start();
-
-        let mock = srv.mock(|when, then| {
-            when.method(GET).path("/robots.txt");
-            then.status(200).body("this is a test");
-        });
-
-        let mut config = Configuration::default();
-
-        let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
-
-        request_robots_txt(&srv.url("/api/users/stuff/things"), &config, tx.clone()).await;
-
-        // note: the proxy doesn't actually do anything other than hit a different code branch
-        // in this unit test; it would however have an effect on an integration test
-        config.proxy = srv.url("/ima-proxy");
-
-        request_robots_txt(&srv.url("/api/different/path"), &config, tx).await;
-
-        assert_eq!(mock.hits(), 2);
-    }
-}
--- a/src/extractor/builder.rs
+++ b/src/extractor/builder.rs
@@ -0,0 +1,171 @@
+use super::*;
+use anyhow::{bail, Result};
+
+/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
+///
+/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
+const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;
+
+/// Regular expression to pull url paths from robots.txt (named capture group: `url_path`)
+///
+/// ref: https://developers.google.com/search/reference/robots_txt
+const ROBOTS_TXT_REGEX: &str =
+    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)
+
+/// Which type of extraction should be performed; selects the branch `Extractor::extract` takes
+#[derive(Debug, Copy, Clone)]
+pub enum ExtractionTarget {
+    /// Examine a response body and extract links (`Extractor::extract_from_body`)
+    ResponseBody,
+
+    /// Examine robots.txt (specifically) and extract links (`Extractor::extract_from_robots`)
+    RobotsTxt,
+}
+
+/// responsible for building an `Extractor`
+pub struct ExtractorBuilder<'a> {
+    /// Response from which to extract links; `None` when created via `with_url`
+    response: Option<&'a FeroxResponse>,
+
+    /// url given to `with_url`; an empty string when created via `with_response`
+    url: String,
+
+    /// reference to the `Configuration` that is handed to the built `Extractor`
+    config: Option<&'a Configuration>,
+
+    /// transmitter to the mpsc that handles statistics gathering
+    tx_stats: Option<UnboundedSender<StatCommand>>,
+
+    /// transmitter to the mpsc that handles recursive scan calls
+    tx_recursion: Option<UnboundedSender<String>>,
+
+    /// transmitter to the mpsc that handles reporting information to the user
+    tx_reporter: Option<UnboundedSender<FeroxResponse>>,
+
+    /// list of urls that will be added to when new urls are extracted
+    scanned_urls: Option<&'a FeroxScans>,
+
+    /// depth at which the scan was started
+    depth: Option<usize>,
+
+    /// shared (`Arc`) handle to the Stats object
+    stats: Option<Arc<Stats>>,
+
+    /// type of extraction to be performed
+    target: Option<ExtractionTarget>,
+}
+
+/// ExtractorBuilder implementation
+impl<'a> ExtractorBuilder<'a> {
+    /// Given a FeroxResponse, create new ExtractorBuilder
+    ///
+    /// `target` and every other field start unset; use the setters below before `build`
+    pub fn with_response(response: &'a FeroxResponse) -> Self {
+        Self {
+            response: Some(response),
+            url: String::new(),
+            config: None,
+            tx_stats: None,
+            tx_recursion: None,
+            tx_reporter: None,
+            scanned_urls: None,
+            depth: None,
+            stats: None,
+            target: None,
+        }
+    }
+
+    /// Given a url, create new ExtractorBuilder
+    ///
+    /// `target` and every other field start unset; use the setters below before `build`
+    pub fn with_url(url: &str) -> Self {
+        Self {
+            response: None,
+            url: url.to_string(),
+            config: None,
+            tx_stats: None,
+            tx_recursion: None,
+            tx_reporter: None,
+            scanned_urls: None,
+            depth: None,
+            stats: None,
+            target: None,
+        }
+    }
+
+    /// builder call to set `config`
+    pub fn config(&mut self, config: &'a Configuration) -> &mut Self {
+        self.config = Some(config);
+        self
+    }
+
+    /// builder call to set `tx_recursion`
+    pub fn recursion_transmitter(&mut self, tx_recursion: UnboundedSender<String>) -> &mut Self {
+        self.tx_recursion = Some(tx_recursion);
+        self
+    }
+
+    /// builder call to set `tx_stats`
+    pub fn stats_transmitter(&mut self, tx_stats: UnboundedSender<StatCommand>) -> &mut Self {
+        self.tx_stats = Some(tx_stats);
+        self
+    }
+
+    /// builder call to set `tx_reporter`
+    pub fn reporter_transmitter(
+        &mut self,
+        tx_reporter: UnboundedSender<FeroxResponse>,
+    ) -> &mut Self {
+        self.tx_reporter = Some(tx_reporter);
+        self
+    }
+
+    /// builder call to set `scanned_urls`
+    pub fn scanned_urls(&mut self, scanned_urls: &'a FeroxScans) -> &mut Self {
+        self.scanned_urls = Some(scanned_urls);
+        self
+    }
+
+    /// builder call to set `stats`
+    pub fn stats(&mut self, stats: Arc<Stats>) -> &mut Self {
+        self.stats = Some(stats);
+        self
+    }
+
+    /// builder call to set `depth`
+    pub fn depth(&mut self, depth: usize) -> &mut Self {
+        self.depth = Some(depth);
+        self
+    }
+
+    /// builder call to set `target`
+    pub fn target(&mut self, target: ExtractionTarget) -> &mut Self {
+        self.target = Some(target);
+        self
+    }
+
+    /// finalize the builder and produce an `Extractor`
+    ///
+    /// returns an error, rather than panicking, if no url/response or any other field is unset
+    pub fn build(&self) -> Result<Extractor<'a>> {
+        if self.url.is_empty() && self.response.is_none() {
+            bail!("Extractor requires either a URL or a FeroxResponse be specified")
+        }
+        let missing = |field: &str| anyhow::anyhow!("Extractor requires `{}` to be set", field);
+
+        Ok(Extractor {
+            links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
+            robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
+            response: self.response,
+            url: self.url.to_owned(),
+            config: self.config.ok_or_else(|| missing("config"))?,
+            tx_stats: self.tx_stats.clone().ok_or_else(|| missing("tx_stats"))?,
+            tx_recursion: self.tx_recursion.clone().ok_or_else(|| missing("tx_recursion"))?,
+            tx_reporter: self.tx_reporter.clone().ok_or_else(|| missing("tx_reporter"))?,
+            scanned_urls: self.scanned_urls.ok_or_else(|| missing("scanned_urls"))?,
+            depth: self.depth.ok_or_else(|| missing("depth"))?,
+            stats: self.stats.clone().ok_or_else(|| missing("stats"))?,
+            target: self.target.ok_or_else(|| missing("target"))?,
+        })
+    }
+}
--- a/src/extractor/container.rs
+++ b/src/extractor/container.rs
@@ -0,0 +1,408 @@
+use super::*;
+use crate::{
+    client,
+    scanner::{send_report, should_filter_response, try_recursion},
+    statistics::{
+        StatCommand::UpdateUsizeField,
+        StatField::{LinksExtracted, TotalExpected},
+    },
+    update_stat,
+    utils::{format_url, make_request},
+};
+use anyhow::{bail, Context, Result};
+use reqwest::{StatusCode, Url};
+use std::collections::HashSet;
+
+/// Whether an active scan is recursive or not; derived from `Configuration::no_recursion`
+#[derive(Debug)]
+enum RecursionStatus {
+    /// Scan is recursive (`no_recursion` is false)
+    Recursive,
+
+    /// Scan is not recursive (`no_recursion` is true)
+    NotRecursive,
+}
+
+/// Handles all logic related to extracting links from requested source code
+#[derive(Debug)]
+pub struct Extractor<'a> {
+    /// `LINKFINDER_REGEX` as a regex::Regex type
+    pub(super) links_regex: Regex,
+
+    /// `ROBOTS_TXT_REGEX` as a regex::Regex type
+    pub(super) robots_regex: Regex,
+
+    /// Response from which to extract links; `None` when built via `ExtractorBuilder::with_url`
+    pub(super) response: Option<&'a FeroxResponse>,
+
+    /// url passed to `ExtractorBuilder::with_url`; empty when built from a response
+    pub(super) url: String,
+
+    /// user configuration; `extract` reads `no_recursion` from it
+    pub(super) config: &'a Configuration,
+
+    /// transmitter to the mpsc that handles statistics gathering
+    pub(super) tx_stats: UnboundedSender<StatCommand>,
+
+    /// transmitter to the mpsc that handles recursive scan calls
+    pub(super) tx_recursion: UnboundedSender<String>,
+
+    /// transmitter to the mpsc that handles reporting information to the user
+    pub(super) tx_reporter: UnboundedSender<FeroxResponse>,
+
+    /// list of urls that will be added to when new urls are extracted
+    pub(super) scanned_urls: &'a FeroxScans,
+
+    /// depth at which the scan was started
+    pub(super) depth: usize,
+
+    /// shared (`Arc`) handle to the Stats object
+    pub(super) stats: Arc<Stats>,
+
+    /// type of extraction to be performed
+    pub(super) target: ExtractionTarget,
+}
+
+/// Extractor implementation
+impl<'a> Extractor<'a> {
+    /// run the extraction selected by `self.target`, then act on every link it produced
+    ///
+    /// links whose response can't be retrieved, or whose response is filtered, are skipped;
+    /// responses identified as files are added to `scanned_urls` and reported; anything else
+    /// is handed to `try_recursion`, but only when `config.no_recursion` is false
+    pub async fn extract(&self) -> Result<()> {
+        let links = match self.target {
+            ExtractionTarget::RobotsTxt => self.extract_from_robots().await?,
+            ExtractionTarget::ResponseBody => self.extract_from_body().await?,
+        };
+
+        let recursive = if self.config.no_recursion {
+            RecursionStatus::NotRecursive
+        } else {
+            RecursionStatus::Recursive
+        };
+
+        for link in links {
+            // todo rename get_feroxresponse_from_link
+            let mut resp = match self.get_feroxresponse_from_link(&link).await {
+                Err(_) => continue,
+                Ok(retrieved) => retrieved,
+            };
+
+            if should_filter_response(&resp, self.tx_stats.clone()) {
+                // a filter matched this response, nothing more to do with it
+                continue;
+            }
+
+            if resp.is_file() {
+                // very likely a file, simply request and report
+                log::debug!("Extracted file: {}", resp);
+                let file_url = resp.url().to_string();
+                self.scanned_urls.add_file_scan(&file_url, self.stats.clone());
+
+                send_report(self.tx_reporter.clone(), resp);
+                continue;
+            }
+
+            if matches!(recursive, RecursionStatus::NotRecursive) {
+                continue;
+            }
+
+            log::debug!("Extracted Directory: {}", resp);
+
+            // recursion is only attempted on urls ending in a /, so a 2xx or 403 response whose
+            // url lacks the trailing slash gets one appended before the recursion attempt
+            let needs_slash = !resp.url().as_str().ends_with('/')
+                && (resp.status().is_success() || matches!(resp.status(), &StatusCode::FORBIDDEN));
+
+            if needs_slash {
+                resp.set_url(&format!("{}/", resp.url()));
+            }
+
+            try_recursion(&resp, self.depth, self.tx_recursion.clone()).await;
+        }
+        Ok(())
+    }
+
+    /// Search the body of `self.response` for links using the linkfinder regex
+    ///   - absolute urls are only kept when their host matches the response's host
+    ///   - for every link found take its url path and parse each sub-path
+    ///     - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
+    ///       with a base url of http://localhost, the following urls would be returned:
+    ///         - homepage/assets/img/icons/handshake.svg
+    ///         - homepage/assets/img/icons/
+    ///         - homepage/assets/img/
+    ///         - homepage/assets/
+    ///         - homepage/
+    ///
+    /// returns an error (instead of panicking) when the Extractor was built without a response
+    pub(super) async fn extract_from_body(&self) -> Result<HashSet<String>> {
+        log::trace!("enter: get_links");
+
+        let response = self.response.context("Extractor has no response to extract from")?;
+        let mut links = HashSet::<String>::new();
+        let body = response.text();
+
+        for capture in self.links_regex.captures_iter(&body) {
+            // remove single & double quotes from both ends of the capture
+            // capture[0] is the entire match, additional capture groups start at [1]
+            let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
+
+            match Url::parse(link) {
+                Ok(absolute) => {
+                    // `Url::domain` is derived from `Url::host`, so comparing hosts also covers
+                    // domains; don't scan things that aren't part of the original target url
+                    if absolute.host() != response.url().host() {
+                        continue;
+                    }
+
+                    if self.add_all_sub_paths(absolute.path(), &mut links).is_err() {
+                        log::warn!("could not add sub-paths from {} to {:?}", absolute, links);
+                    }
+                }
+                Err(e) => {
+                    // this is the expected error that happens when we try to parse a url fragment
+                    //     ex: Url::parse("/login") -> Err("relative URL without a base")
+                    // while this is technically an error, these are good results for us
+                    if e.to_string().contains("relative URL without a base") {
+                        if self.add_all_sub_paths(link, &mut links).is_err() {
+                            log::warn!("could not add sub-paths from {} to {:?}", link, links);
+                        }
+                    } else {
+                        // unexpected error has occurred
+                        log::error!("Could not parse given url: {}", e);
+                    }
+                }
+            }
+        }
+
+        self.update_stats(links.len());
+
+        log::trace!("exit: get_links -> {:?}", links);
+
+        Ok(links)
+    }
+
+    /// take a url fragment like homepage/assets/img/icons/handshake.svg and pass it, plus each
+    /// parent directory below, to `add_link_to_set_of_links`; stops at the first error
+    ///     - homepage/assets/img/icons/
+    ///     - homepage/assets/img/
+    ///     - homepage/assets/
+    ///     - homepage/
+    fn add_all_sub_paths(&self, url_path: &str, links: &mut HashSet<String>) -> Result<()> {
+        log::trace!("enter: add_all_sub_paths({}, {:?})", url_path, links);
+
+        for sub_path in self.get_sub_paths_from_path(url_path) {
+            self.add_link_to_set_of_links(&sub_path, links)?;
+        }
+
+        log::trace!("exit: add_all_sub_paths");
+        Ok(())
+    }
+
+    /// Iterate over a given path, return a list of every sub-path found
+    ///
+    /// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
+    /// the following fragments would be returned, longest first:
+    ///   - homepage/assets/img/icons/handshake.svg
+    ///   - homepage/assets/img/icons/
+    ///   - homepage/assets/img/
+    ///   - homepage/assets/
+    ///   - homepage/
+    ///
+    /// Only the shortened prefixes receive a trailing slash; the full path is returned
+    /// without one, since it may point at a file.
+    ///
+    /// Leading, trailing, and repeated `/` are discarded before the sub-paths are built, so
+    /// `/homepage/assets/` yields `homepage/assets` and `homepage/`. A path with no segments
+    /// at all (empty, or nothing but slashes) yields an empty list.
+    pub(super) fn get_sub_paths_from_path(&self, path: &str) -> Vec<String> {
+        log::trace!("enter: get_sub_paths_from_path({})", path);
+
+        // filter out any empty strings caused by .split
+        let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
+
+        let length = parts.len();
+
+        // walk from the full path down to its first segment; every part is non-empty, so
+        // each joined prefix is non-empty as well and needs no further filtering
+        let paths: Vec<String> = (1..=length)
+            .rev()
+            .map(|end| {
+                // `end` is the number of leading parts that make up this sub-path
+                let joined = parts[..end].join("/");
+
+                if end == length {
+                    // the full (slash-trimmed) path gets no trailing slash
+                    // ex: buried/misc/stupidfile.php
+                    joined
+                } else {
+                    // anything shorter is a parent folder, mark it with a trailing slash
+                    format!("{}/", joined)
+                }
+            })
+            .collect();
+
+        log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
+        paths
+    }
+
+    /// helper that joins the base url with `link` and inserts the result into `links`; errors,
+    /// leaving `links` untouched, when the base url is missing/unparsable or the join fails
+    pub(super) fn add_link_to_set_of_links(
+        &self,
+        link: &str,
+        links: &mut HashSet<String>,
+    ) -> Result<()> {
+        log::trace!("enter: add_link_to_set_of_links({}, {:?})", link, links);
+
+        let old_url = match self.target {
+            ExtractionTarget::ResponseBody => match self.response {
+                Some(response) => response.url.clone(),
+                None => bail!("Could not join {}: no response to use as base url", link),
+            },
+            ExtractionTarget::RobotsTxt => match Url::parse(&self.url) {
+                Ok(u) => u,
+                Err(e) => bail!("Could not parse {}: {}", self.url, e),
+            },
+        };
+
+        let new_url = old_url
+            .join(link)
+            .with_context(|| format!("Could not join {} with {}", old_url, link))?;
+        links.insert(new_url.to_string());
+
+        log::trace!("exit: add_link_to_set_of_links");
+        Ok(())
+    }
+
+    /// Request a single extracted link and wrap the reply in a `FeroxResponse`
+    ///
+    /// currently used in two places:
+    ///   - links from response bodies
+    ///   - links from robots.txt responses
+    ///
+    /// general steps taken:
+    ///   - build a new Url from `url` and the add_slash/queries config values
+    ///   - check if the new Url has already been seen/scanned -> Err("previously seen url")
+    ///   - make a request to the new Url -> Ok(response)
+    ///
+    /// errors from building the Url or from the request itself are passed up unchanged
+    pub(super) async fn get_feroxresponse_from_link(&self, url: &str) -> Result<FeroxResponse> {
+        log::trace!("enter: get_feroxresponse_from_link({})", url);
+
+        // an empty word and no extension are handed over, only `url` itself and the
+        // slash/query settings take part in building the new url
+        let formatted = format_url(
+            &url,
+            &"",
+            self.config.add_slash,
+            &self.config.queries,
+            None,
+            self.tx_stats.clone(),
+        )?;
+
+        let already_seen = self
+            .scanned_urls
+            .get_scan_by_url(&formatted.to_string())
+            .is_some();
+
+        if already_seen {
+            //we've seen the url before and don't need to scan again
+            log::trace!("exit: get_feroxresponse_from_link -> None");
+            bail!("previously seen url");
+        }
+
+        // make the request and store the response
+        let raw = make_request(&self.config.client, &formatted, self.tx_stats.clone()).await?;
+        let wrapped = FeroxResponse::from(raw, true).await;
+
+        log::trace!("exit: get_feroxresponse_from_link -> {:?}", wrapped);
+
+        Ok(wrapped)
+    }
+
+    /// Entry point to perform link extraction from robots.txt
+    ///
+    /// the extractor's url can have paths and subpaths, however robots.txt will be requested
+    /// from the root of the url
+    /// given the url:
+    ///     http://localhost/stuff/things
+    /// this function requests:
+    ///     http://localhost/robots.txt
+    pub(super) async fn extract_from_robots(&self) -> Result<HashSet<String>> {
+        log::trace!("enter: extract_robots_txt");
+
+        let mut links: HashSet<String> = HashSet::new();
+        let response = self.request_robots_txt().await?;
+
+        // parsed once and reused; set_path swaps out the whole path on every iteration
+        let mut new_url = Url::parse(&self.url)?;
+        for capture in self.robots_regex.captures_iter(response.text.as_str()) {
+            if let Some(new_path) = capture.name("url_path") {
+                new_url.set_path(new_path.as_str());
+                if self.add_all_sub_paths(new_url.path(), &mut links).is_err() {
+                    log::warn!("could not add sub-paths from {} to {:?}", new_url, links);
+                }
+            }
+        }
+
+        self.update_stats(links.len());
+
+        log::trace!("exit: extract_robots_txt -> {:?}", links);
+        Ok(links)
+    }
+
+    /// requests /robots.txt from the root of this extractor's url
+    ///
+    /// example:
+    ///     http://localhost/api/users -> http://localhost/robots.txt
+    ///
+    /// Whatever path the url carries is overwritten; the request always goes to
+    /// base url + /robots.txt
+    ///
+    /// returns Err when the url can't be parsed or when the request fails
+    pub(super) async fn request_robots_txt(&self) -> Result<FeroxResponse> {
+        log::trace!("enter: get_robots_file");
+
+        // domain/robots.txt frequently redirects to www.domain/robots.txt or something
+        // similar, so this request gets its own client that always follows redirects, no
+        // matter what the user chose for the scanning client. Every other user specified
+        // setting is carried over as-is
+        let follow_redirects = true;
+
+        // an empty proxy string in the config is handed to the client as None
+        let proxy = Some(self.config.proxy.as_str()).filter(|p| !p.is_empty());
+
+        let client = client::initialize(
+            self.config.timeout,
+            &self.config.user_agent,
+            follow_redirects,
+            self.config.insecure,
+            &self.config.headers,
+            proxy,
+        );
+
+        let mut robots_url = Url::parse(&self.url)?;
+        robots_url.set_path("/robots.txt"); // overwrite existing path with /robots.txt
+
+        let raw = make_request(&client, &robots_url, self.tx_stats.clone()).await?;
+        let ferox_response = FeroxResponse::from(raw, true).await;
+
+        log::trace!("exit: get_robots_file -> {}", ferox_response);
+
+        Ok(ferox_response)
+    }
+
+    /// update total number of links extracted and expected responses; every link adds one
+    /// expected response per configured extension (a single one when there are none)
+    fn update_stats(&self, num_links: usize) {
+        // a floor of 1 keeps the product from dropping to zero when no extensions are set
+        let per_link = std::cmp::max(1, self.config.extensions.len());
+
+        update_stat!(self.tx_stats, UpdateUsizeField(LinksExtracted, num_links));
+        let expected = num_links * per_link;
+        update_stat!(self.tx_stats, UpdateUsizeField(TotalExpected, expected));
+    }
+}
--- a/src/extractor/mod.rs
+++ b/src/extractor/mod.rs
@@ -0,0 +1,19 @@
+//! extract links from html source and robots.txt
+mod builder;
+mod container;
+#[cfg(test)]
+mod tests;
+
+pub use self::builder::ExtractionTarget;
+pub use self::builder::ExtractorBuilder;
+pub use self::container::Extractor;
+
+use crate::{
+    config::Configuration,
+    scan_manager::FeroxScans,
+    statistics::{StatCommand, Stats},
+    FeroxResponse,
+};
+use regex::Regex;
+use std::sync::Arc;
+use tokio::sync::mpsc::UnboundedSender;
--- a/src/extractor/tests.rs
+++ b/src/extractor/tests.rs
@@ -0,0 +1,372 @@
+use super::*;
+use crate::utils::make_request;
+use crate::FeroxChannel;
+use anyhow::Result;
+use httpmock::Method::GET;
+use httpmock::MockServer;
+use lazy_static::lazy_static;
+use reqwest::{header::HeaderMap, Client, StatusCode, Url};
+use std::collections::HashSet;
+use tokio::sync::mpsc;
+
+lazy_static! {
+    /// default Configuration handed to the extractors built in this module
+    static ref CONFIG: Configuration = Configuration::new();
+
+    /// FeroxScans shared by the extractors built in this module
+    static ref SCANS: FeroxScans = FeroxScans::default();
+
+    /// canned FeroxResponse that backs `BODY_EXT`
+    static ref RESPONSE: FeroxResponse = get_test_response();
+
+    /// Extractor that targets response bodies, built around `RESPONSE`
+    static ref BODY_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::ResponseBody);
+
+    /// Extractor that targets robots.txt, built around https://localhost
+    static ref ROBOTS_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::RobotsTxt);
+}
+
+fn get_test_response() -> FeroxResponse {
+    FeroxResponse {
+        url: Url::parse("https://localhost").unwrap(),
+        status: StatusCode::OK,
+        headers: HeaderMap::new(),
+        text: String::new(),
+        content_length: 125,
+        line_count: 14,
+        word_count: 10,
+        wildcard: true,
+    }
+}
+
+/// builds an Extractor for `target` from throwaway channels and this module's statics
+fn setup_extractor(target: ExtractionTarget) -> Extractor<'static> {
+    // the receiving halves are dropped right away, nothing can read from these channels
+    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+
+    let mut builder = match target {
+        ExtractionTarget::RobotsTxt => ExtractorBuilder::with_url("https://localhost"),
+        ExtractionTarget::ResponseBody => ExtractorBuilder::with_response(&RESPONSE),
+    };
+
+    builder
+        .target(target)
+        .depth(4)
+        .config(&CONFIG)
+        .recursion_transmitter(recursion_tx)
+        .stats_transmitter(stats_tx)
+        .reporter_transmitter(reporter_tx)
+        .scanned_urls(&SCANS)
+        .stats(Arc::new(Stats::new()))
+        .build()
+        .unwrap()
+}
+
+#[test]
+/// extract sub paths from the given url fragment; expect 5 sub paths (the 4 parent folders
+/// plus the fragment itself) and that all are in the expected array
+fn extractor_get_sub_paths_from_path_with_multiple_paths() {
+    let path = "homepage/assets/img/icons/handshake.svg";
+    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(path);
+    let b_paths = BODY_EXT.get_sub_paths_from_path(path);
+    let expected = vec![
+        "homepage/",
+        "homepage/assets/",
+        "homepage/assets/img/",
+        "homepage/assets/img/icons/",
+        "homepage/assets/img/icons/handshake.svg",
+    ];
+
+    assert_eq!(r_paths.len(), expected.len());
+    assert_eq!(b_paths.len(), expected.len());
+    for expected_path in expected {
+        assert!(r_paths.contains(&expected_path.to_string()));
+        assert!(b_paths.contains(&expected_path.to_string()));
+    }
+}
+
+#[test]
+/// a fragment wrapped in slashes yields exactly 2 sub paths; the surrounding slashes must
+/// not produce empty strings, and only the parent folder ends with a slash
+/// (checked against both the robots.txt and the response body extractor)
+fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
+    let fragment = "/homepage/assets/";
+    let wanted = ["homepage/", "homepage/assets"];
+
+    for extractor in [&*ROBOTS_EXT, &*BODY_EXT].iter() {
+        let found = extractor.get_sub_paths_from_path(fragment);
+
+        assert_eq!(found.len(), wanted.len());
+        for sub_path in wanted.iter() {
+            assert!(found.contains(&sub_path.to_string()));
+        }
+    }
+}
+
+#[test]
+/// a single word without any forward slash comes back untouched, as the only sub path;
+/// no forward slashes are added to it
+/// (checked against both the robots.txt and the response body extractor)
+fn extractor_get_sub_paths_from_path_with_only_a_word() {
+    let word = "homepage";
+    let wanted = vec![word.to_string()];
+
+    let from_robots = ROBOTS_EXT.get_sub_paths_from_path(word);
+    let from_body = BODY_EXT.get_sub_paths_from_path(word);
+
+    // a list holding exactly 1 entry that is also contained in `wanted` can only be
+    // equal to `wanted`, so whole-vector equality covers the length and the contents
+    assert_eq!(from_robots, wanted);
+    assert_eq!(from_body, wanted);
+}
+
+#[test]
+/// a single word with a leading slash yields 1 sub path: the word, forward slash removed
+fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
+    let absolute = "/homepage";
+    let wanted = "homepage".to_string();
+
+    let results = vec![
+        ROBOTS_EXT.get_sub_paths_from_path(absolute),
+        BODY_EXT.get_sub_paths_from_path(absolute),
+    ];
+    for found in results {
+        assert_eq!(found.len(), 1);
+        assert!(found.contains(&wanted));
+    }
+}
+#[test]
+/// an empty url combined with no FeroxResponse leaves nothing to extract from; build must Err
+fn extractor_builder_bails_when_neither_required_field_is_set() {
+    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+
+    // with_url("") is the only source handed to the builder, no response is ever attached
+    let result = ExtractorBuilder::with_url("")
+        .target(ExtractionTarget::ResponseBody)
+        .depth(4)
+        .config(&CONFIG)
+        .recursion_transmitter(recursion_tx)
+        .stats_transmitter(stats_tx)
+        .reporter_transmitter(reporter_tx)
+        .scanned_urls(&SCANS)
+        .stats(Arc::new(Stats::new()))
+        .build();
+
+    assert!(result.is_err());
+}
+
+#[test]
+/// the happy path: both static extractors use https://localhost as their base url, so
+/// joining a bare word to it must leave exactly one absolute url in a previously
+/// empty set of links
+fn extractor_add_link_to_set_of_links_happy_path() {
+    let (robots_fragment, robots_url) = ("admin", "https://localhost/admin");
+    let (body_fragment, body_url) = ("shmadmin", "https://localhost/shmadmin");
+
+    // robots.txt extractor: the base url is the url that was given to the builder
+    let mut robots_links: HashSet<String> = HashSet::new();
+    assert!(robots_links.is_empty());
+    ROBOTS_EXT
+        .add_link_to_set_of_links(robots_fragment, &mut robots_links)
+        .unwrap();
+    assert_eq!(robots_links.len(), 1);
+    assert!(robots_links.contains(robots_url));
+
+    // response body extractor: the base url is the url of the canned response
+    let mut body_links: HashSet<String> = HashSet::new();
+    assert!(body_links.is_empty());
+    BODY_EXT
+        .add_link_to_set_of_links(body_fragment, &mut body_links)
+        .unwrap();
+    assert_eq!(body_links.len(), 1);
+    assert!(body_links.contains(body_url));
+}
+
+#[test]
+/// a fragment of nothing but backslashes is expected to be unjoinable; both extractors
+/// must return an error and the set of links must stay empty
+fn extractor_add_link_to_set_of_links_with_non_base_url() {
+    let unjoinable = "\\\\\\\\";
+    let mut found: HashSet<String> = HashSet::new();
+    assert!(found.is_empty());
+
+    let from_robots = ROBOTS_EXT.add_link_to_set_of_links(unjoinable, &mut found);
+    assert!(from_robots.is_err());
+    let from_body = BODY_EXT.add_link_to_set_of_links(unjoinable, &mut found);
+    assert!(from_body.is_err());
+
+    assert_eq!(found.len(), 0);
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// serve a body whose only link is an absolute url on a foreign domain, turn the reply into
+/// a FeroxResponse with make_request, and extract from it; links that leave the scanned
+/// domain are skipped, so expect an empty set back
+async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain() -> Result<()> {
+    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+
+    let server = MockServer::start();
+
+    let endpoint = server.mock(|when, then| {
+        when.method(GET).path("/some-path");
+        then.status(200).body(
+            "\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"",
+        );
+    });
+
+    // fetch the mocked page by hand, which accounts for the single hit asserted below; the
+    // foreign link in its body is expected to be skipped rather than requested
+    let client = Client::new();
+    let target = Url::parse(&server.url("/some-path")).unwrap();
+    let raw = make_request(&client, &target, stats_tx.clone()).await.unwrap();
+    let page = FeroxResponse::from(raw, true).await;
+
+    let extractor = ExtractorBuilder::with_response(&page)
+        .target(ExtractionTarget::ResponseBody)
+        .depth(4)
+        .config(&CONFIG)
+        .recursion_transmitter(recursion_tx)
+        .stats_transmitter(stats_tx)
+        .reporter_transmitter(reporter_tx)
+        .scanned_urls(&SCANS)
+        .stats(Arc::new(Stats::new()))
+        .build()?;
+
+    let links = extractor.extract_from_body().await?;
+
+    assert!(links.is_empty());
+
+    assert_eq!(endpoint.hits(), 1);
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// test that /robots.txt is correctly requested given a base url (happy path)
+async fn request_robots_txt_without_proxy() -> Result<()> {
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+    let config = Configuration::new();
+
+    let srv = MockServer::start();
+
+    let mock = srv.mock(|when, then| {
+        when.method(GET).path("/robots.txt");
+        then.status(200).body("this is a test");
+    });
+
+    let extractor = ExtractorBuilder::with_url(&srv.url("/api/users/stuff/things"))
+        .target(ExtractionTarget::RobotsTxt)
+        .depth(4)
+        .config(&config)
+        .recursion_transmitter(tx_dir.clone())
+        .stats_transmitter(tx_stats.clone())
+        .reporter_transmitter(tx_term.clone())
+        .scanned_urls(&SCANS)
+        .stats(stats.clone())
+        .build()?;
+
+    let resp = extractor.request_robots_txt().await?;
+
+    // assert_eq! reports the offending status on failure, no debug print required
+    assert_eq!(resp.status(), &StatusCode::OK);
+    assert_eq!(resp.content_length(), 14);
+    assert_eq!(mock.hits(), 1);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// test that /robots.txt is requested from the root of the given base url while the
+/// configuration carries a proxy (happy path)
+async fn request_robots_txt_with_proxy() -> Result<()> {
+    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+
+    let server = MockServer::start();
+
+    let endpoint = server.mock(|when, then| {
+        when.method(GET).path("/robots.txt");
+        then.status(200).body("this is also a test");
+    });
+
+    // note: the proxy doesn't actually do anything other than hit a different code branch
+    // in this unit test; it would however have an effect on an integration test
+    let mut proxied = Configuration::new();
+    proxied.proxy = server.url("/ima-proxy");
+
+    let extractor = ExtractorBuilder::with_url(&server.url("/api/different/path"))
+        .target(ExtractionTarget::RobotsTxt)
+        .depth(4)
+        .config(&proxied)
+        .recursion_transmitter(recursion_tx)
+        .stats_transmitter(stats_tx)
+        .reporter_transmitter(reporter_tx)
+        .scanned_urls(&SCANS)
+        .stats(Arc::new(Stats::new()))
+        .build()?;
+
+    let robots = extractor.request_robots_txt().await?;
+
+    assert!(matches!(robots.status(), &StatusCode::OK));
+    assert_eq!(robots.content_length(), 19);
+    assert_eq!(endpoint.hits(), 1);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// get_feroxresponse_from_link's happy path: each static extractor requests the mocked url
+/// and gets a FeroxResponse back, so the endpoint is expected to see 2 hits
+async fn get_feroxresponse_from_link_happy_path() -> Result<()> {
+    let server = MockServer::start();
+    let login = server.url("/login.php");
+
+    let endpoint = server.mock(|when, then| {
+        when.method(GET).path("/login.php");
+        then.status(200).body("this is a test");
+    });
+
+    // one request per extractor, any error fails the test right here
+    let from_robots = ROBOTS_EXT.get_feroxresponse_from_link(&login).await?;
+    let from_body = BODY_EXT.get_feroxresponse_from_link(&login).await?;
+
+    for response in [&from_robots, &from_body].iter() {
+        assert!(matches!(response.status(), &StatusCode::OK));
+        assert_eq!(response.content_length(), 14);
+    }
+
+    assert_eq!(endpoint.hits(), 2);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// a url that was already added to the shared scans must make get_feroxresponse_from_link
+/// bail for both extractors, without a single request reaching the server
+async fn get_feroxresponse_from_link_bails_on_seen_url() -> Result<()> {
+    let path = "/unique-for-this-test.php";
+    let server = MockServer::start();
+    let seen_url = server.url(path);
+
+    let endpoint = server.mock(|when, then| {
+        when.method(GET).path(path);
+        then.status(200)
+            .body("this is a unique test, don't reuse the endpoint");
+    });
+
+    SCANS.add_file_scan(&seen_url, ROBOTS_EXT.stats.clone());
+
+    let from_robots = ROBOTS_EXT.get_feroxresponse_from_link(&seen_url).await;
+    let from_body = BODY_EXT.get_feroxresponse_from_link(&seen_url).await;
+
+    assert!(from_robots.is_err() && from_body.is_err());
+    assert_eq!(endpoint.hits(), 0); // function exits before requests can happen
+    Ok(())
+}
--- a/src/filters/helpers.rs
+++ b/src/filters/helpers.rs
@@ -0,0 +1,36 @@
+// use super::WildcardFilter;
+// use crate::{
+//     statistics::{
+//         StatCommand::{self, UpdateUsizeField},
+//         StatField::WildcardsFiltered,
+//     },
+//     FeroxResponse,
+// };
+// use anyhow::Result;
+// use tokio::sync::mpsc::UnboundedSender;
+//
+// /// Simple helper to stay DRY; determines whether or not a given `FeroxResponse` should be reported
+// /// to the user or not.
+// pub fn should_filter_response(
+//     response: &FeroxResponse,
+//     tx_stats: UnboundedSender<StatCommand>,
+// ) -> Result<bool> {
+//     let filters = FILTERS
+//     match FILTERS.read() {
+//         Ok(filters) => {
+//             for filter in filters.iter() {
+//                 // wildcard.should_filter goes here
+//                 if filter.should_filter_response(&response) {
+//                     if filter.as_any().downcast_ref::<WildcardFilter>().is_some() {
+//                         update_stat!(tx_stats, UpdateUsizeField(WildcardsFiltered, 1))
+//                     }
+//                     return true;
+//                 }
+//             }
+//         }
+//         Err(e) => {
+//             log::error!("{}", e);
+//         }
+//     }
+//     false
+// }
--- a/src/filters/mod.rs
+++ b/src/filters/mod.rs
@@ -8,6 +8,7 @@ mod regex;
 mod similarity;
 #[cfg(test)]
 mod tests;
+mod helpers;

 pub use self::lines::LinesFilter;
 pub use self::regex::RegexFilter;
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,19 +1,19 @@
-pub mod utils;
-pub mod client;
+pub mod banner;
 pub mod config;
-pub mod extractor;
-pub mod filters;
+mod client;
+mod event_handlers;
+mod filters;
 pub mod heuristics;
 pub mod logger;
-pub mod parser;
+mod parser;
 pub mod progress;
 pub mod reporter;
 pub mod scan_manager;
 pub mod scanner;
 pub mod statistics;
-mod event_handlers;
-pub mod banner;
 mod traits;
+pub mod utils;
+mod extractor;

 use crate::{
    traits::FeroxSerialize,
--- a/src/reporter.rs
+++ b/src/reporter.rs
@@ -5,9 +5,11 @@ use crate::{
        StatCommand::{self, UpdateUsizeField},
        StatField::ResourcesDiscovered,
    },
+    update_stat,
    utils::{ferox_print, make_request, open_file},
    FeroxChannel, FeroxResponse, FeroxSerialize,
 };
+
 use console::strip_ansi_codes;
 use std::{
    fs, io,
--- a/src/scanner.rs
+++ b/src/scanner.rs
@@ -1,6 +1,6 @@
 use crate::{
    config::{Configuration, CONFIGURATION},
-    extractor::{extract_robots_txt, get_links, request_feroxresponse_from_new_link},
+    extractor::{ExtractionTarget, ExtractorBuilder},
    filters::{
        LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter, WildcardFilter,
        WordsFilter,
@@ -13,6 +13,7 @@ use crate::{
        Stats,
    },
    traits::FeroxFilter,
+    update_stat,
    utils::{format_url, get_current_depth, make_request},
    FeroxChannel, FeroxResponse, SIMILARITY_THRESHOLD,
 };
@@ -307,11 +308,12 @@ fn reached_max_depth(url: &Url, base_depth: usize, max_depth: usize) -> bool {
 /// Helper function that wraps logic to check for recursion opportunities
 ///
 /// When a recursion opportunity is found, the new url is sent across the recursion channel
-async fn try_recursion(
+pub async fn try_recursion(
    response: &FeroxResponse,
    base_depth: usize,
    transmitter: UnboundedSender<String>,
 ) {
+    // todo this should be part of the recursion handler
    log::trace!(
        "enter: try_recursion({}, {}, {:?})",
        response,
@@ -433,56 +435,19 @@ async fn make_requests(
            }

            if CONFIGURATION.extract_links && !ferox_response.status().is_redirection() {
-                let new_links = get_links(&ferox_response, tx_stats.clone()).await;
+                let extractor = ExtractorBuilder::with_response(&ferox_response)
+                    .target(ExtractionTarget::ResponseBody)
+                    .depth(base_depth)
+                    .config(&CONFIGURATION)
+                    .recursion_transmitter(dir_chan.clone())
+                    .stats_transmitter(tx_stats.clone())
+                    .reporter_transmitter(report_chan.clone())
+                    .scanned_urls(&SCANNED_URLS)
+                    .stats(stats.clone())
+                    .build()
+                    .unwrap(); // todo change once this function returns Result

-                for new_link in new_links {
-                    let mut new_ferox_response = match request_feroxresponse_from_new_link(
-                        &new_link,
-                        tx_stats.clone(),
-                    )
-                    .await
-                    {
-                        Some(resp) => resp,
-                        None => continue,
-                    };
-
-                    // filter if necessary
-                    if should_filter_response(&new_ferox_response, tx_stats.clone()) {
-                        continue;
-                    }
-
-                    if new_ferox_response.is_file() {
-                        // very likely a file, simply request and report
-                        log::debug!("Singular extraction: {}", new_ferox_response);
-
-                        SCANNED_URLS
-                            .add_file_scan(&new_ferox_response.url().to_string(), stats.clone());
-
-                        send_report(report_chan.clone(), new_ferox_response);
-
-                        continue;
-                    }
-
-                    if !CONFIGURATION.no_recursion {
-                        log::debug!("Recursive extraction: {}", new_ferox_response);
-
-                        if !new_ferox_response.url().as_str().ends_with('/')
-                            && (new_ferox_response.status().is_success()
-                                || matches!(new_ferox_response.status(), &StatusCode::FORBIDDEN))
-                        {
-                            // if the url doesn't end with a /
-                            // and the response code is either a 2xx or 403
-
-                            // since all of these are 2xx or 403, recursion is only attempted if the
-                            // url ends in a /. I am actually ok with adding the slash and not
-                            // adding it, as both have merit.  Leaving it in for now to see how
-                            // things turn out (current as of: v1.1.0)
-                            new_ferox_response.set_url(&format!("{}/", new_ferox_response.url()));
-                        }
-
-                        try_recursion(&new_ferox_response, base_depth, dir_chan.clone()).await;
-                    }
-                }
+                let _ = extractor.extract().await;
            }

            // everything else should be reported
@@ -506,61 +471,6 @@ pub fn send_report(report_sender: UnboundedSender<FeroxResponse>, response: Fero
    log::trace!("exit: send_report");
 }

-/// Request /robots.txt from given url
-async fn scan_robots_txt(
-    target_url: &str,
-    base_depth: usize,
-    stats: Arc<Stats>,
-    tx_term: UnboundedSender<FeroxResponse>,
-    tx_dir: UnboundedSender<String>,
-    tx_stats: UnboundedSender<StatCommand>,
-) {
-    log::trace!(
-        "enter: scan_robots_txt({}, {}, {:?}, {:?}, {:?}, {:?})",
-        target_url,
-        base_depth,
-        stats,
-        tx_term,
-        tx_dir,
-        tx_stats
-    );
-
-    let robots_links = extract_robots_txt(&target_url, &CONFIGURATION, tx_stats.clone()).await;
-
-    for robot_link in robots_links {
-        // create a url based on the given command line options, continue on error
-        let mut ferox_response =
-            match request_feroxresponse_from_new_link(&robot_link, tx_stats.clone()).await {
-                Some(resp) => resp,
-                None => continue,
-            };
-
-        if should_filter_response(&ferox_response, tx_stats.clone()) {
-            continue;
-        }
-
-        if ferox_response.is_file() {
-            log::debug!("File extracted from robots.txt: {}", ferox_response);
-            SCANNED_URLS.add_file_scan(&robot_link, stats.clone());
-            send_report(tx_term.clone(), ferox_response);
-        } else if !CONFIGURATION.no_recursion {
-            log::debug!("Directory extracted from robots.txt: {}", ferox_response);
-            // todo this code is essentially the same as another piece around ~467 of this file
-            if !ferox_response.url().as_str().ends_with('/')
-                && (ferox_response.status().is_success()
-                    || matches!(ferox_response.status(), &StatusCode::FORBIDDEN))
-            {
-                // if the url doesn't end with a /
-                // and the response code is either a 2xx or 403
-                ferox_response.set_url(&format!("{}/", ferox_response.url()));
-            }
-
-            try_recursion(&ferox_response, base_depth, tx_dir.clone()).await;
-        }
-    }
-    log::trace!("exit: scan_robots_txt");
-}
-
 /// Scan a given url using a given wordlist
 ///
 /// This is the primary entrypoint for the scanner
@@ -596,15 +506,20 @@ pub async fn scan_url(
        if CONFIGURATION.extract_links {
            // only grab robots.txt on the initial scan_url calls. all fresh dirs will be passed
            // to try_recursion
-            scan_robots_txt(
-                target_url,
-                base_depth,
-                stats.clone(),
-                tx_term.clone(),
-                tx_dir.clone(),
-                tx_stats.clone(),
-            )
-            .await;
+
+            let extractor = ExtractorBuilder::with_url(target_url)
+                .target(ExtractionTarget::RobotsTxt)
+                .depth(base_depth)
+                .config(&CONFIGURATION)
+                .recursion_transmitter(tx_dir.clone())
+                .stats_transmitter(tx_stats.clone())
+                .reporter_transmitter(tx_term.clone())
+                .scanned_urls(&SCANNED_URLS)
+                .stats(stats.clone())
+                .build()
+                .unwrap(); // todo change once this function returns Result
+
+            let _ = extractor.extract().await;
        }

        update_stat!(tx_stats, UpdateUsizeField(TotalScans, 1));
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -5,7 +5,7 @@ use crate::{
        StatCommand::{self, AddError, AddStatus},
        StatError::{Connection, Other, Redirection, Request, Timeout, UrlFormat},
    },
-    FeroxError, FeroxResult,
+    FeroxError,
 };
 use anyhow::{bail, Context, Result};
 use console::{strip_ansi_codes, style, user_attended};
@@ -184,7 +184,7 @@ pub fn format_url(
    queries: &[(String, String)],
    extension: Option<&str>,
    tx_stats: UnboundedSender<StatCommand>,
-) -> FeroxResult<Url> {
+) -> Result<Url> {
    log::trace!(
        "enter: format_url({}, {}, {}, {:?} {:?}, {:?})",
        url,
@@ -214,7 +214,7 @@ pub fn format_url(
        update_stat!(tx_stats, AddError(UrlFormat));

        log::trace!("exit: format_url -> {}", err);
-        return Err(Box::new(err));
+        bail!("{}", err);
    }

    // from reqwest::Url::join
@@ -284,7 +284,7 @@ pub fn format_url(
            update_stat!(tx_stats, AddError(UrlFormat));
            log::trace!("exit: format_url -> {}", e);
            log::error!("Could not join {} with {}", word, base_url);
-            Err(Box::new(e))
+            bail!("{}", e)
        }
    }
 }