Compare commits
2 Commits
main
...
aho-corasi
Author | SHA1 | Date | |
---|---|---|---|
|
4d8990d37f | ||
4e1206a1bb |
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -17,6 +17,15 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.13"
|
||||
@ -893,10 +902,12 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
||||
name = "word_freq_analyzer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"clap",
|
||||
"dashmap",
|
||||
"futures",
|
||||
"indicatif",
|
||||
"once_cell",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
|
@ -6,10 +6,12 @@ edition = "2021"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
aho-corasick = "1.1.2"
|
||||
clap = { version = "4.5.2", features = ["derive"] }
|
||||
dashmap = { version = "5.5.3", features = ["serde"] }
|
||||
futures = "0.3.30"
|
||||
indicatif = "0.17.8"
|
||||
once_cell = "1.19.0"
|
||||
serde_json = "1.0.114"
|
||||
tokio = { version = "1.36.0", features = ["full"] }
|
||||
tracing = "0.1.40"
|
||||
|
117
src/main.rs
117
src/main.rs
@ -1,19 +1,20 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Write;
|
||||
use std::io;
|
||||
use std::os::unix::prelude::*;
|
||||
use std::path::PathBuf;
|
||||
use std::rc::{Rc, Weak};
|
||||
use std::str;
|
||||
use std::sync::{
|
||||
atomic::{AtomicBool, AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
use std::time::Duration;
|
||||
|
||||
use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
|
||||
use clap::Parser;
|
||||
use dashmap::DashMap;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::{fs, sync::Semaphore, time::sleep};
|
||||
use tracing::Instrument;
|
||||
|
||||
@ -46,8 +47,14 @@ static KEYWORDS: &[&str] = &[
|
||||
"物联网",
|
||||
"机器学习",
|
||||
];
|
||||
|
||||
// One keyword automaton per thread, built from `KEYWORDS` through `AhoCorasick::new`.
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
|
||||
static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| {
|
||||
AhoCorasick::builder()
|
||||
.kind(Some(AhoCorasickKind::DFA))
|
||||
.match_kind(MatchKind::Standard)
|
||||
.prefilter(true)
|
||||
.build(KEYWORDS.iter())
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about, long_about = None)]
|
||||
@ -109,16 +116,20 @@ async fn main() -> io::Result<()> {
|
||||
tracing::info!("Start to read file");
|
||||
let buf = fs::read(&task.file).await?;
|
||||
let len = buf.len();
|
||||
let content = String::from_utf8_lossy(&buf);
|
||||
|
||||
tracing::debug!("Start to analyze");
|
||||
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
|
||||
for (word, count) in result.iter() {
|
||||
tracing::trace!(word = %word, count = %count, "Analyzed");
|
||||
let mut result = HashMap::new();
|
||||
for mat in ANALYZER.find_iter(&buf) {
|
||||
let word = str::from_utf8(&buf[mat.range()]).unwrap();
|
||||
tracing::trace!(word = %word, "Matched");
|
||||
result
|
||||
.entry(word.to_string())
|
||||
.and_modify(|e| *e += 1)
|
||||
.or_insert(1);
|
||||
analysis
|
||||
.entry(word.to_string())
|
||||
.and_modify(|e| *e += count)
|
||||
.or_insert(*count);
|
||||
.and_modify(|e| *e += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
tracing::debug!("Finished analysis");
|
||||
|
||||
@ -177,87 +188,3 @@ struct Task {
|
||||
file: PathBuf,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
/// Multi-pattern substring counter built on an Aho-Corasick automaton.
///
/// States live in a flat arena (`nodes`) and refer to each other by index,
/// so the automaton is plain owned data: it needs no reference counting or
/// runtime borrow checks while scanning, and it is `Send + Sync`.
#[derive(Debug, Clone)]
pub struct AhoCorasick {
    /// Arena of states. Index `Self::ROOT` is always present.
    nodes: Vec<ACNode>,
}

impl Default for AhoCorasick {
    /// Creates an automaton with no keywords; `analyze` then returns an empty map.
    fn default() -> Self {
        Self {
            nodes: vec![ACNode::default()],
        }
    }
}

impl AhoCorasick {
    /// Index of the root state inside `nodes`.
    const ROOT: usize = 0;

    /// Builds an automaton that recognises every keyword in `words`.
    ///
    /// A keyword that appears more than once in `words` is registered only
    /// once, so each occurrence in the analysed text is counted exactly once.
    pub fn new(words: &[&str]) -> Self {
        let mut automaton = Self::default();
        for word in words {
            let mut cur = Self::ROOT;
            for c in word.chars() {
                cur = match automaton.nodes[cur].trans.get(&c).copied() {
                    Some(next) => next,
                    None => {
                        let next = automaton.nodes.len();
                        automaton.nodes.push(ACNode::default());
                        automaton.nodes[cur].trans.insert(c, next);
                        next
                    }
                };
            }
            // Before suffix links are built, a terminal state only holds the
            // length of its own keyword; a non-empty list therefore means this
            // keyword was already registered.
            let lengths = &mut automaton.nodes[cur].lengths;
            if lengths.is_empty() {
                // Byte length: `analyze` slices the input by byte offsets.
                lengths.push(word.len());
            }
        }
        automaton.build_suffix();
        automaton
    }

    /// Computes the suffix (failure) link of every state in breadth-first
    /// order and merges the match lengths reachable through that link.
    fn build_suffix(&mut self) {
        let mut queue = VecDeque::new();
        queue.push_back(Self::ROOT);
        while let Some(parent) = queue.pop_front() {
            // Copy the outgoing edges so the arena can be mutated below.
            let edges: Vec<(char, usize)> = self.nodes[parent]
                .trans
                .iter()
                .map(|(&c, &child)| (c, child))
                .collect();
            for (c, child) in edges {
                queue.push_back(child);
                let suffix = if parent == Self::ROOT {
                    // Depth-one states always fall back to the root.
                    Self::ROOT
                } else {
                    // Walk the parent's suffix chain until a state with an
                    // edge on `c` is found, or the root is exhausted.
                    let mut fallback = self.nodes[parent].suffix;
                    loop {
                        if let Some(&target) = self.nodes[fallback].trans.get(&c) {
                            break target;
                        }
                        if fallback == Self::ROOT {
                            break Self::ROOT;
                        }
                        fallback = self.nodes[fallback].suffix;
                    }
                };
                // The suffix state is strictly shallower than `child`, so the
                // breadth-first order guarantees its lengths are already final.
                let inherited = self.nodes[suffix].lengths.clone();
                let node = &mut self.nodes[child];
                node.suffix = suffix;
                node.lengths.extend(inherited);
            }
        }
    }

    /// Counts every (possibly overlapping) occurrence of each keyword in `s`.
    ///
    /// The returned map is keyed by the matched slices of `s`; keywords that
    /// never occur are absent.
    pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
        let mut counts = HashMap::new();
        let mut cur = Self::ROOT;
        let mut position: usize = 0;
        for c in s.chars() {
            // Follow suffix links until a transition on `c` exists or the
            // root itself has none.
            loop {
                if let Some(&next) = self.nodes[cur].trans.get(&c) {
                    cur = next;
                    break;
                }
                if cur == Self::ROOT {
                    break;
                }
                cur = self.nodes[cur].suffix;
            }
            position += c.len_utf8();
            // Every recorded length is a keyword ending at `position`.
            for &len in &self.nodes[cur].lengths {
                *counts.entry(&s[position - len..position]).or_insert(0) += 1;
            }
        }
        counts
    }
}

/// A single automaton state stored in the `AhoCorasick` arena.
#[derive(Debug, Clone, Default)]
struct ACNode {
    /// Outgoing edges: next character -> arena index of the target state.
    trans: BTreeMap<char, usize>,
    /// Arena index of the suffix (failure) state; the root points at itself.
    suffix: usize,
    /// Byte lengths of all keywords that end when this state is reached.
    lengths: Vec<usize>,
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user