Compare commits

..

No commits in common. "4d8990d37f3300c1c91e697451297e4ac37fdb38" and "f322ae675ad4feef1694b92cf6cd13ac26494026" have entirely different histories.

3 changed files with 95 additions and 35 deletions

11
Cargo.lock generated
View File

@ -17,15 +17,6 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "anstream" name = "anstream"
version = "0.6.13" version = "0.6.13"
@ -902,12 +893,10 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
name = "word_freq_analyzer" name = "word_freq_analyzer"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"aho-corasick",
"clap", "clap",
"dashmap", "dashmap",
"futures", "futures",
"indicatif", "indicatif",
"once_cell",
"serde_json", "serde_json",
"tokio", "tokio",
"tracing", "tracing",

View File

@ -6,12 +6,10 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
aho-corasick = "1.1.2"
clap = { version = "4.5.2", features = ["derive"] } clap = { version = "4.5.2", features = ["derive"] }
dashmap = { version = "5.5.3", features = ["serde"] } dashmap = { version = "5.5.3", features = ["serde"] }
futures = "0.3.30" futures = "0.3.30"
indicatif = "0.17.8" indicatif = "0.17.8"
once_cell = "1.19.0"
serde_json = "1.0.114" serde_json = "1.0.114"
tokio = { version = "1.36.0", features = ["full"] } tokio = { version = "1.36.0", features = ["full"] }
tracing = "0.1.40" tracing = "0.1.40"

View File

@ -1,20 +1,19 @@
use std::collections::HashMap; use std::cell::RefCell;
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::fmt::Write; use std::fmt::Write;
use std::io; use std::io;
use std::os::unix::prelude::*; use std::os::unix::prelude::*;
use std::path::PathBuf; use std::path::PathBuf;
use std::str; use std::rc::{Rc, Weak};
use std::sync::{ use std::sync::{
atomic::{AtomicBool, AtomicUsize, Ordering}, atomic::{AtomicBool, AtomicUsize, Ordering},
Arc, Arc,
}; };
use std::time::Duration; use std::time::Duration;
use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
use clap::Parser; use clap::Parser;
use dashmap::DashMap; use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressState, ProgressStyle}; use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use once_cell::sync::Lazy;
use tokio::{fs, sync::Semaphore, time::sleep}; use tokio::{fs, sync::Semaphore, time::sleep};
use tracing::Instrument; use tracing::Instrument;
@ -47,14 +46,8 @@ static KEYWORDS: &[&str] = &[
"物联网", "物联网",
"机器学习", "机器学习",
]; ];
static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::builder() thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
.kind(Some(AhoCorasickKind::DFA))
.match_kind(MatchKind::Standard)
.prefilter(true)
.build(KEYWORDS.iter())
.unwrap()
});
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(version, about, long_about = None)] #[command(version, about, long_about = None)]
@ -116,20 +109,16 @@ async fn main() -> io::Result<()> {
tracing::info!("Start to read file"); tracing::info!("Start to read file");
let buf = fs::read(&task.file).await?; let buf = fs::read(&task.file).await?;
let len = buf.len(); let len = buf.len();
let content = String::from_utf8_lossy(&buf);
tracing::debug!("Start to analyze"); tracing::debug!("Start to analyze");
let mut result = HashMap::new(); let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
for mat in ANALYZER.find_iter(&buf) { for (word, count) in result.iter() {
let word = str::from_utf8(&buf[mat.range()]).unwrap(); tracing::trace!(word = %word, count = %count, "Analyzed");
tracing::trace!(word = %word, "Matched");
result
.entry(word.to_string())
.and_modify(|e| *e += 1)
.or_insert(1);
analysis analysis
.entry(word.to_string()) .entry(word.to_string())
.and_modify(|e| *e += 1) .and_modify(|e| *e += count)
.or_insert(1); .or_insert(*count);
} }
tracing::debug!("Finished analysis"); tracing::debug!("Finished analysis");
@ -188,3 +177,87 @@ struct Task {
file: PathBuf, file: PathBuf,
size: usize, size: usize,
} }
/// A simple Aho–Corasick automaton for counting keyword occurrences.
///
/// Patterns are stored in a trie; suffix (failure) links are precomputed so a
/// text can be scanned in a single pass. All matches are counted, including
/// overlapping ones and matches that are suffixes of longer matches.
#[derive(Default)]
pub struct AhoCorasick {
    // Trie root. Nodes are shared via `Rc` because suffix links point back
    // into the trie (held weakly to avoid reference cycles).
    root: Rc<RefCell<ACNode>>,
}

impl AhoCorasick {
    /// Builds the automaton from `words`.
    ///
    /// Each word is inserted into the trie, then suffix links are computed
    /// breadth-first over the finished trie.
    pub fn new(words: &[&str]) -> Self {
        let root = Rc::new(RefCell::new(ACNode::default()));
        for word in words {
            let mut cur = Rc::clone(&root);
            for c in word.chars() {
                // Walk/extend the trie one character at a time. The clone is
                // bound in its own statement so the `RefMut` borrow of `cur`
                // ends before `cur` is reassigned.
                let next = Rc::clone(cur.borrow_mut().trans.entry(c).or_default());
                cur = next;
            }
            // Record the pattern's byte length at its terminal node; byte
            // lengths let `analyze` slice the haystack directly.
            cur.borrow_mut().lengths.push(word.len());
        }
        Self::build_suffix(Rc::clone(&root));
        Self { root }
    }

    /// Computes suffix (failure) links for every node, breadth-first.
    ///
    /// BFS order guarantees a node's suffix target (which is strictly
    /// shallower) is finished before the node itself, so the merged
    /// `lengths` lists are already transitive.
    fn build_suffix(root: Rc<RefCell<ACNode>>) {
        let mut q = VecDeque::new();
        q.push_back(Rc::clone(&root));
        while let Some(parent) = q.pop_front() {
            let parent = parent.borrow();
            for (c, child) in &parent.trans {
                q.push_back(Rc::clone(child));
                let mut child = child.borrow_mut();
                // Follow the parent's suffix chain until a node with a
                // transition on `c` is found, or fall back to the root.
                let mut suffix = parent.suffix.upgrade();
                loop {
                    match suffix {
                        None => {
                            child.lengths.extend_from_slice(&root.borrow().lengths);
                            child.suffix = Rc::downgrade(&root);
                            break;
                        }
                        Some(node) => {
                            // Take an owned handle out of the borrow so no
                            // reference into a `Ref` temporary outlives this
                            // statement.
                            let target = node.borrow().trans.get(c).map(Rc::clone);
                            if let Some(target) = target {
                                // Inherit every pattern ending at the suffix
                                // target so matches can be read off a single
                                // node during scanning.
                                child.lengths.extend_from_slice(&target.borrow().lengths);
                                child.suffix = Rc::downgrade(&target);
                                break;
                            }
                            suffix = node.borrow().suffix.upgrade();
                        }
                    }
                }
            }
        }
    }

    /// Scans `s` once and returns how often each pattern occurs.
    ///
    /// Keys borrow from `s`; counts include overlapping matches.
    pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
        let mut ans = HashMap::new();
        let mut cur = Rc::clone(&self.root);
        // Byte offset just past the character currently being processed.
        let mut position: usize = 0;
        for c in s.chars() {
            loop {
                // Try to advance; on failure fall back along suffix links.
                // The transition is cloned out so the `Ref` borrow ends
                // before `cur` is reassigned.
                let next = cur.borrow().trans.get(&c).map(Rc::clone);
                if let Some(child) = next {
                    cur = child;
                    break;
                }
                let suffix = cur.borrow().suffix.upgrade();
                match suffix {
                    Some(node) => cur = node,
                    // At the root with no transition: skip this character.
                    None => break,
                }
            }
            position += c.len_utf8();
            // Every pattern ending at this node was merged into `lengths` by
            // `build_suffix`. Slicing is safe: each length is the byte length
            // of a whole pattern matched along char boundaries.
            for &len in &cur.borrow().lengths {
                *ans.entry(&s[position - len..position]).or_insert(0) += 1;
            }
        }
        ans
    }
}

/// One trie node of the Aho–Corasick automaton.
#[derive(Default)]
struct ACNode {
    // Outgoing edges keyed by character. `BTreeMap` keeps iteration order
    // deterministic during the BFS in `build_suffix`.
    trans: BTreeMap<char, Rc<RefCell<ACNode>>>,
    // Longest proper suffix of this node's path that is also a trie path.
    // Weak to break the cycle back toward shallower nodes.
    suffix: Weak<RefCell<ACNode>>,
    // Byte lengths of all patterns ending at this node (its own patterns
    // plus those inherited from the suffix chain).
    lengths: Vec<usize>,
}