Use aho-corasick crate
Signed-off-by: hr567 <hr567@hr567.me>
This commit is contained in:
parent
f322ae675a
commit
4e1206a1bb
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -17,6 +17,15 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anstream"
|
name = "anstream"
|
||||||
version = "0.6.13"
|
version = "0.6.13"
|
||||||
@ -893,10 +902,12 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
|||||||
name = "word_freq_analyzer"
|
name = "word_freq_analyzer"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
"clap",
|
"clap",
|
||||||
"dashmap",
|
"dashmap",
|
||||||
"futures",
|
"futures",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
|
"once_cell",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
@ -6,10 +6,12 @@ edition = "2021"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
aho-corasick = "1.1.2"
|
||||||
clap = { version = "4.5.2", features = ["derive"] }
|
clap = { version = "4.5.2", features = ["derive"] }
|
||||||
dashmap = { version = "5.5.3", features = ["serde"] }
|
dashmap = { version = "5.5.3", features = ["serde"] }
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
indicatif = "0.17.8"
|
indicatif = "0.17.8"
|
||||||
|
once_cell = "1.19.0"
|
||||||
serde_json = "1.0.114"
|
serde_json = "1.0.114"
|
||||||
tokio = { version = "1.36.0", features = ["full"] }
|
tokio = { version = "1.36.0", features = ["full"] }
|
||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
|
110
src/main.rs
110
src/main.rs
@ -1,19 +1,20 @@
|
|||||||
use std::cell::RefCell;
|
use std::collections::HashMap;
|
||||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
|
||||||
use std::fmt::Write;
|
use std::fmt::Write;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::os::unix::prelude::*;
|
use std::os::unix::prelude::*;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::rc::{Rc, Weak};
|
use std::str;
|
||||||
use std::sync::{
|
use std::sync::{
|
||||||
atomic::{AtomicBool, AtomicUsize, Ordering},
|
atomic::{AtomicBool, AtomicUsize, Ordering},
|
||||||
Arc,
|
Arc,
|
||||||
};
|
};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use aho_corasick::AhoCorasick;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use dashmap::DashMap;
|
use dashmap::DashMap;
|
||||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
use tokio::{fs, sync::Semaphore, time::sleep};
|
use tokio::{fs, sync::Semaphore, time::sleep};
|
||||||
use tracing::Instrument;
|
use tracing::Instrument;
|
||||||
|
|
||||||
@ -46,8 +47,7 @@ static KEYWORDS: &[&str] = &[
|
|||||||
"物联网",
|
"物联网",
|
||||||
"机器学习",
|
"机器学习",
|
||||||
];
|
];
|
||||||
|
static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| AhoCorasick::new(KEYWORDS.iter()).unwrap());
|
||||||
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
|
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
#[derive(Parser, Debug)]
|
||||||
#[command(version, about, long_about = None)]
|
#[command(version, about, long_about = None)]
|
||||||
@ -109,16 +109,20 @@ async fn main() -> io::Result<()> {
|
|||||||
tracing::info!("Start to read file");
|
tracing::info!("Start to read file");
|
||||||
let buf = fs::read(&task.file).await?;
|
let buf = fs::read(&task.file).await?;
|
||||||
let len = buf.len();
|
let len = buf.len();
|
||||||
let content = String::from_utf8_lossy(&buf);
|
|
||||||
|
|
||||||
tracing::debug!("Start to analyze");
|
tracing::debug!("Start to analyze");
|
||||||
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
|
let mut result = HashMap::new();
|
||||||
for (word, count) in result.iter() {
|
for mat in ANALYZER.find_iter(&buf) {
|
||||||
tracing::trace!(word = %word, count = %count, "Analyzed");
|
let word = str::from_utf8(&buf[mat.range()]).unwrap();
|
||||||
|
tracing::trace!(word = %word, "Matched");
|
||||||
|
result
|
||||||
|
.entry(word.to_string())
|
||||||
|
.and_modify(|e| *e += 1)
|
||||||
|
.or_insert(1);
|
||||||
analysis
|
analysis
|
||||||
.entry(word.to_string())
|
.entry(word.to_string())
|
||||||
.and_modify(|e| *e += count)
|
.and_modify(|e| *e += 1)
|
||||||
.or_insert(*count);
|
.or_insert(1);
|
||||||
}
|
}
|
||||||
tracing::debug!("Finished analysis");
|
tracing::debug!("Finished analysis");
|
||||||
|
|
||||||
@ -177,87 +181,3 @@ struct Task {
|
|||||||
file: PathBuf,
|
file: PathBuf,
|
||||||
size: usize,
|
size: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
pub struct AhoCorasick {
|
|
||||||
root: Rc<RefCell<ACNode>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl AhoCorasick {
|
|
||||||
pub fn new(words: &[&str]) -> Self {
|
|
||||||
let root = Rc::new(RefCell::new(ACNode::default()));
|
|
||||||
for word in words {
|
|
||||||
let mut cur = Rc::clone(&root);
|
|
||||||
for c in word.chars() {
|
|
||||||
cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default());
|
|
||||||
}
|
|
||||||
cur.borrow_mut().lengths.push(word.len());
|
|
||||||
}
|
|
||||||
Self::build_suffix(Rc::clone(&root));
|
|
||||||
Self { root }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_suffix(root: Rc<RefCell<ACNode>>) {
|
|
||||||
let mut q = VecDeque::new();
|
|
||||||
q.push_back(Rc::clone(&root));
|
|
||||||
while let Some(parent) = q.pop_front() {
|
|
||||||
let parent = parent.borrow();
|
|
||||||
for (c, child) in &parent.trans {
|
|
||||||
q.push_back(Rc::clone(child));
|
|
||||||
let mut child = child.borrow_mut();
|
|
||||||
let mut suffix = parent.suffix.upgrade();
|
|
||||||
loop {
|
|
||||||
match &suffix {
|
|
||||||
None => {
|
|
||||||
child.lengths.extend(root.borrow().lengths.clone());
|
|
||||||
child.suffix = Rc::downgrade(&root);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
Some(node) => {
|
|
||||||
if node.borrow().trans.contains_key(c) {
|
|
||||||
let node = &node.borrow().trans[c];
|
|
||||||
child.lengths.extend(node.borrow().lengths.clone());
|
|
||||||
child.suffix = Rc::downgrade(node);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
suffix = suffix.unwrap().borrow().suffix.upgrade();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
|
|
||||||
let mut ans = HashMap::new();
|
|
||||||
let mut cur = Rc::clone(&self.root);
|
|
||||||
let mut position: usize = 0;
|
|
||||||
for c in s.chars() {
|
|
||||||
loop {
|
|
||||||
if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) {
|
|
||||||
cur = Rc::clone(child);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let suffix = cur.borrow().suffix.clone();
|
|
||||||
match suffix.upgrade() {
|
|
||||||
Some(node) => cur = node,
|
|
||||||
None => break,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
position += c.len_utf8();
|
|
||||||
for &len in &cur.borrow().lengths {
|
|
||||||
ans.entry(&s[position - len..position])
|
|
||||||
.and_modify(|e| *e += 1)
|
|
||||||
.or_insert(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ans
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
struct ACNode {
|
|
||||||
trans: BTreeMap<char, Rc<RefCell<ACNode>>>,
|
|
||||||
suffix: Weak<RefCell<ACNode>>,
|
|
||||||
lengths: Vec<usize>,
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user