Compare commits

..

2 Commits

Author SHA1 Message Date
hr567
4d8990d37f
Use DFA for better performance
Signed-off-by: hr567 <hr567@linux.alibaba.com>
2024-03-08 16:36:41 +08:00
4e1206a1bb
Use aho-corasick crate
Signed-off-by: hr567 <hr567@hr567.me>
2024-03-08 11:47:00 +08:00
3 changed files with 35 additions and 95 deletions

11
Cargo.lock generated
View File

@ -17,6 +17,15 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.13"
@ -893,10 +902,12 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
name = "word_freq_analyzer"
version = "0.1.0"
dependencies = [
"aho-corasick",
"clap",
"dashmap",
"futures",
"indicatif",
"once_cell",
"serde_json",
"tokio",
"tracing",

View File

@ -6,10 +6,12 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
aho-corasick = "1.1.2"
clap = { version = "4.5.2", features = ["derive"] }
dashmap = { version = "5.5.3", features = ["serde"] }
futures = "0.3.30"
indicatif = "0.17.8"
once_cell = "1.19.0"
serde_json = "1.0.114"
tokio = { version = "1.36.0", features = ["full"] }
tracing = "0.1.40"

View File

@ -1,19 +1,20 @@
use std::cell::RefCell;
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::collections::HashMap;
use std::fmt::Write;
use std::io;
use std::os::unix::prelude::*;
use std::path::PathBuf;
use std::rc::{Rc, Weak};
use std::str;
use std::sync::{
atomic::{AtomicBool, AtomicUsize, Ordering},
Arc,
};
use std::time::Duration;
use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
use clap::Parser;
use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use once_cell::sync::Lazy;
use tokio::{fs, sync::Semaphore, time::sleep};
use tracing::Instrument;
@ -46,8 +47,14 @@ static KEYWORDS: &[&str] = &[
"物联网",
"机器学习",
];
// One `ANALYZER` instance per thread, constructed from `KEYWORDS` via `AhoCorasick::new`;
// callers reach it through `ANALYZER.with(|analyzer| ...)`.
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
/// Shared keyword matcher over [`KEYWORDS`], constructed lazily on first access.
///
/// The automaton is requested as a DFA (`AhoCorasickKind::DFA`) with
/// `MatchKind::Standard` and the prefilter enabled.
///
/// # Panics
///
/// Panics on first access if the automaton cannot be built from `KEYWORDS`.
/// The keyword list is a compile-time constant, so a failure here is a
/// programming error rather than a runtime condition to recover from.
static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| {
    AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::DFA))
        .match_kind(MatchKind::Standard)
        .prefilter(true)
        .build(KEYWORDS.iter())
        .expect("failed to build the Aho-Corasick DFA from the static KEYWORDS list")
});
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
@ -109,16 +116,20 @@ async fn main() -> io::Result<()> {
tracing::info!("Start to read file");
let buf = fs::read(&task.file).await?;
let len = buf.len();
let content = String::from_utf8_lossy(&buf);
tracing::debug!("Start to analyze");
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
for (word, count) in result.iter() {
tracing::trace!(word = %word, count = %count, "Analyzed");
let mut result = HashMap::new();
for mat in ANALYZER.find_iter(&buf) {
let word = str::from_utf8(&buf[mat.range()]).unwrap();
tracing::trace!(word = %word, "Matched");
result
.entry(word.to_string())
.and_modify(|e| *e += 1)
.or_insert(1);
analysis
.entry(word.to_string())
.and_modify(|e| *e += count)
.or_insert(*count);
.and_modify(|e| *e += 1)
.or_insert(1);
}
tracing::debug!("Finished analysis");
@ -177,87 +188,3 @@ struct Task {
file: PathBuf,
size: usize,
}
/// Multi-keyword occurrence counter backed by a hand-written Aho-Corasick automaton.
///
/// The automaton is a trie over `char`s whose nodes carry a suffix (failure) link and
/// the byte lengths of every keyword that ends at that node. Nodes are shared through
/// `Rc<RefCell<_>>`, so an instance cannot be sent to or shared with other threads.
///
/// A default-constructed value contains no keywords and never reports a match.
#[derive(Default)]
pub struct AhoCorasick {
    /// Root of the trie (the empty prefix). Its suffix link is always dangling.
    root: Rc<RefCell<ACNode>>,
}

impl AhoCorasick {
    /// Builds an automaton that recognises every entry of `words`.
    ///
    /// Empty keywords are ignored (they would otherwise "match" after every character),
    /// and a keyword listed more than once is registered only once so that each
    /// occurrence in the text is counted a single time.
    pub fn new(words: &[&str]) -> Self {
        let root = Rc::new(RefCell::new(ACNode::default()));
        for word in words.iter().filter(|word| !word.is_empty()) {
            let mut cur = Rc::clone(&root);
            for c in word.chars() {
                // Take an owned handle to the child first so the mutable borrow of
                // `cur` has ended before `cur` is reassigned.
                let next = Rc::clone(cur.borrow_mut().trans.entry(c).or_default());
                cur = next;
            }
            let mut terminal = cur.borrow_mut();
            // Lengths are in bytes because `analyze` slices the input by byte offset.
            if !terminal.lengths.contains(&word.len()) {
                terminal.lengths.push(word.len());
            }
        }
        Self::build_suffix(Rc::clone(&root));
        Self { root }
    }

    /// Fills in the suffix link and the inherited match lengths of every node.
    ///
    /// Nodes are visited breadth-first, so when a node is processed every shallower
    /// node (including its eventual suffix target) already has its final `lengths`.
    fn build_suffix(root: Rc<RefCell<ACNode>>) {
        let mut queue = VecDeque::new();
        queue.push_back(Rc::clone(&root));
        while let Some(parent) = queue.pop_front() {
            let parent = parent.borrow();
            for (c, child) in &parent.trans {
                queue.push_back(Rc::clone(child));

                // Walk the parent's suffix chain until some node has a transition on
                // `c`; if the chain runs out, the child's suffix link is the root.
                let mut fallback = parent.suffix.upgrade();
                let target = loop {
                    let node = match fallback {
                        Some(node) => node,
                        None => break Rc::clone(&root),
                    };
                    let next = node.borrow().trans.get(c).cloned();
                    if let Some(next) = next {
                        break next;
                    }
                    fallback = node.borrow().suffix.upgrade();
                };

                // `target` is strictly shallower than `child`, so it is never the
                // same cell as the one mutably borrowed here.
                let mut child = child.borrow_mut();
                child
                    .lengths
                    .extend(target.borrow().lengths.iter().copied());
                child.suffix = Rc::downgrade(&target);
            }
        }
    }

    /// Counts every (possibly overlapping) keyword occurrence in `s`.
    ///
    /// Returns a map from the matched slice of `s` to the number of times it occurs.
    /// Keywords that never occur are absent from the map.
    pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
        let mut ans = HashMap::new();
        let mut cur = Rc::clone(&self.root);
        for (start, c) in s.char_indices() {
            // Follow suffix links until a transition on `c` exists or the root
            // (whose suffix link is dangling) has been tried.
            loop {
                let next = cur.borrow().trans.get(&c).cloned();
                if let Some(child) = next {
                    cur = child;
                    break;
                }
                let fallback = cur.borrow().suffix.upgrade();
                match fallback {
                    Some(node) => cur = node,
                    None => break,
                }
            }
            // Byte offset just past `c`; every keyword ending here ends at `end`.
            let end = start + c.len_utf8();
            for &len in &cur.borrow().lengths {
                *ans.entry(&s[end - len..end]).or_insert(0) += 1;
            }
        }
        ans
    }
}

/// A single trie node of the automaton.
#[derive(Default)]
struct ACNode {
    /// Outgoing transitions, keyed by the next character.
    trans: BTreeMap<char, Rc<RefCell<ACNode>>>,
    /// Suffix (failure) link; dangling for the root and until `build_suffix` has run.
    suffix: Weak<RefCell<ACNode>>,
    /// Byte lengths of all keywords that end at this node, including those inherited
    /// through the suffix link.
    lengths: Vec<usize>,
}