diff --git a/Cargo.lock b/Cargo.lock index 43521c4..3e7dd78 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,15 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.13" @@ -893,10 +902,12 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" name = "word_freq_analyzer" version = "0.1.0" dependencies = [ + "aho-corasick", "clap", "dashmap", "futures", "indicatif", + "once_cell", "serde_json", "tokio", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 378247c..53b3454 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,10 +6,12 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +aho-corasick = "1.1.2" clap = { version = "4.5.2", features = ["derive"] } dashmap = { version = "5.5.3", features = ["serde"] } futures = "0.3.30" indicatif = "0.17.8" +once_cell = "1.19.0" serde_json = "1.0.114" tokio = { version = "1.36.0", features = ["full"] } tracing = "0.1.40" diff --git a/src/main.rs b/src/main.rs index 51b54df..bbdb7e5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,19 +1,20 @@ -use std::cell::RefCell; -use std::collections::{BTreeMap, HashMap, VecDeque}; +use std::collections::HashMap; use std::fmt::Write; use std::io; use std::os::unix::prelude::*; use std::path::PathBuf; -use std::rc::{Rc, Weak}; +use std::str; use std::sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, Arc, }; use std::time::Duration; +use aho_corasick::AhoCorasick; use clap::Parser; use dashmap::DashMap; use indicatif::{ProgressBar, ProgressState, ProgressStyle}; +use once_cell::sync::Lazy; use 
tokio::{fs, sync::Semaphore, time::sleep}; use tracing::Instrument; @@ -46,8 +47,7 @@ static KEYWORDS: &[&str] = &[ "物联网", "机器学习", ]; - -thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); } +static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| AhoCorasick::new(KEYWORDS.iter()).unwrap()); #[derive(Parser, Debug)] #[command(version, about, long_about = None)] @@ -109,16 +109,20 @@ async fn main() -> io::Result<()> { tracing::info!("Start to read file"); let buf = fs::read(&task.file).await?; let len = buf.len(); - let content = String::from_utf8_lossy(&buf); tracing::debug!("Start to analyze"); - let result = ANALYZER.with(|analyzer| analyzer.analyze(&content)); - for (word, count) in result.iter() { - tracing::trace!(word = %word, count = %count, "Analyzed"); + let mut result = HashMap::new(); + for mat in ANALYZER.find_iter(&buf) { + let word = str::from_utf8(&buf[mat.range()]).unwrap(); + tracing::trace!(word = %word, "Matched"); + result + .entry(word.to_string()) + .and_modify(|e| *e += 1) + .or_insert(1); analysis .entry(word.to_string()) - .and_modify(|e| *e += count) - .or_insert(*count); + .and_modify(|e| *e += 1) + .or_insert(1); } tracing::debug!("Finished analysis"); @@ -177,87 +181,3 @@ struct Task { file: PathBuf, size: usize, } - -#[derive(Default)] -pub struct AhoCorasick { - root: Rc<RefCell<ACNode>>, -} - -impl AhoCorasick { - pub fn new(words: &[&str]) -> Self { - let root = Rc::new(RefCell::new(ACNode::default())); - for word in words { - let mut cur = Rc::clone(&root); - for c in word.chars() { - cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default()); - } - cur.borrow_mut().lengths.push(word.len()); - } - Self::build_suffix(Rc::clone(&root)); - Self { root } - } - - fn build_suffix(root: Rc<RefCell<ACNode>>) { - let mut q = VecDeque::new(); - q.push_back(Rc::clone(&root)); - while let Some(parent) = q.pop_front() { - let parent = parent.borrow(); - for (c, child) in &parent.trans { - q.push_back(Rc::clone(child)); - let mut child = child.borrow_mut(); 
- let mut suffix = parent.suffix.upgrade(); - loop { - match &suffix { - None => { - child.lengths.extend(root.borrow().lengths.clone()); - child.suffix = Rc::downgrade(&root); - break; - } - Some(node) => { - if node.borrow().trans.contains_key(c) { - let node = &node.borrow().trans[c]; - child.lengths.extend(node.borrow().lengths.clone()); - child.suffix = Rc::downgrade(node); - break; - } - suffix = suffix.unwrap().borrow().suffix.upgrade(); - } - } - } - } - } - } - - pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> { - let mut ans = HashMap::new(); - let mut cur = Rc::clone(&self.root); - let mut position: usize = 0; - for c in s.chars() { - loop { - if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) { - cur = Rc::clone(child); - break; - } - let suffix = cur.borrow().suffix.clone(); - match suffix.upgrade() { - Some(node) => cur = node, - None => break, - } - } - position += c.len_utf8(); - for &len in &cur.borrow().lengths { - ans.entry(&s[position - len..position]) - .and_modify(|e| *e += 1) - .or_insert(1); - } - } - ans - } -} - -#[derive(Default)] -struct ACNode { - trans: BTreeMap<char, Rc<RefCell<ACNode>>>, - suffix: Weak<RefCell<ACNode>>, - lengths: Vec<usize>, -}