From bcfd8b6eefae0b9556de1c05c34d6275b7cede08 Mon Sep 17 00:00:00 2001 From: epi Date: Tue, 11 Jul 2023 06:23:18 -0400 Subject: [PATCH] fixed unwrap in nlp::document --- src/nlp/document.rs | 24 +++++++++++++----------- src/scanner/requester.rs | 13 +++++++------ 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/nlp/document.rs b/src/nlp/document.rs index 6a0244d..daeb785 100644 --- a/src/nlp/document.rs +++ b/src/nlp/document.rs @@ -40,15 +40,16 @@ impl Document { } /// create a new `Document` from the given HTML string - pub(crate) fn from_html(raw_html: &str) -> Self { + pub(crate) fn from_html(raw_html: &str) -> Option<Self> { let selector = Selector::parse("body").unwrap(); let html = Html::parse_document(raw_html); - let text = html - .select(&selector) - .next() - .unwrap() + let Some(element) = html.select(&selector).next() else { + return None; + }; + + let text = element .descendants() .filter_map(|node| { if !node.value().is_text() && !node.value().is_comment() { @@ -95,7 +96,7 @@ impl Document { // call `new` to push the parsed html through the pre-processing pipeline and process all // the words - Self::new(&text) + Some(Self::new(&text)) } /// Log normalized weighting scheme for term frequency @@ -146,19 +147,20 @@ mod tests { #[test] /// `Document::new` should preprocess html and generate a hashmap of `Term, TermMetadata` fn nlp_document_creation_from_html() { - let empty = Document::from_html(""); + let empty = Document::from_html("").unwrap(); assert_eq!(empty.number_of_terms, 0); - let other_empty = Document::from_html("

"); + let other_empty = Document::from_html("

").unwrap(); assert_eq!(other_empty.number_of_terms, 0); - let third_empty = Document::from_html("

"); + let third_empty = + Document::from_html("

").unwrap(); assert_eq!(third_empty.number_of_terms, 0); // p tag for is_text check and comment for is_comment let doc = Document::from_html( "

The air quality in Singapore.

", - ); + ).unwrap(); let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"]; @@ -209,7 +211,7 @@ mod tests { /// ensure words in script/style tags aren't processed fn document_creation_skips_script_and_style_tags() { let html = "

got worse on Wednesday.

"; - let doc = Document::from_html(html); + let doc = Document::from_html(html).unwrap(); let keys = doc.terms().keys().map(|key| key.raw()).collect::>(); let expected = ["worse", "wednesday"]; diff --git a/src/scanner/requester.rs b/src/scanner/requester.rs index acdd03d..0517eca 100644 --- a/src/scanner/requester.rs +++ b/src/scanner/requester.rs @@ -475,12 +475,13 @@ impl Requester { if self.handles.config.collect_words { if let Ok(mut guard) = TF_IDF.write() { - let doc = Document::from_html(ferox_response.text()); - guard.add_document(doc); - if guard.num_documents() % 12 == 0 - || (guard.num_documents() < 5 && guard.num_documents() % 2 == 0) - { - guard.calculate_tf_idf_scores(); + if let Some(doc) = Document::from_html(ferox_response.text()) { + guard.add_document(doc); + if guard.num_documents() % 12 == 0 + || (guard.num_documents() < 5 && guard.num_documents() % 2 == 0) + { + guard.calculate_tf_idf_scores(); + } } } }