Compare commits
No commits in common. "4d8990d37f3300c1c91e697451297e4ac37fdb38" and "f322ae675ad4feef1694b92cf6cd13ac26494026" have entirely different histories.
4d8990d37f
...
f322ae675a
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -17,15 +17,6 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "aho-corasick"
|
|
||||||
version = "1.1.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
|
||||||
dependencies = [
|
|
||||||
"memchr",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anstream"
|
name = "anstream"
|
||||||
version = "0.6.13"
|
version = "0.6.13"
|
||||||
@ -902,12 +893,10 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
|||||||
name = "word_freq_analyzer"
|
name = "word_freq_analyzer"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
|
||||||
"clap",
|
"clap",
|
||||||
"dashmap",
|
"dashmap",
|
||||||
"futures",
|
"futures",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
"once_cell",
|
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
@ -6,12 +6,10 @@ edition = "2021"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
aho-corasick = "1.1.2"
|
|
||||||
clap = { version = "4.5.2", features = ["derive"] }
|
clap = { version = "4.5.2", features = ["derive"] }
|
||||||
dashmap = { version = "5.5.3", features = ["serde"] }
|
dashmap = { version = "5.5.3", features = ["serde"] }
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
indicatif = "0.17.8"
|
indicatif = "0.17.8"
|
||||||
once_cell = "1.19.0"
|
|
||||||
serde_json = "1.0.114"
|
serde_json = "1.0.114"
|
||||||
tokio = { version = "1.36.0", features = ["full"] }
|
tokio = { version = "1.36.0", features = ["full"] }
|
||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
|
117
src/main.rs
117
src/main.rs
@ -1,20 +1,19 @@
|
|||||||
use std::collections::HashMap;
|
use std::cell::RefCell;
|
||||||
|
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||||
use std::fmt::Write;
|
use std::fmt::Write;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::os::unix::prelude::*;
|
use std::os::unix::prelude::*;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str;
|
use std::rc::{Rc, Weak};
|
||||||
use std::sync::{
|
use std::sync::{
|
||||||
atomic::{AtomicBool, AtomicUsize, Ordering},
|
atomic::{AtomicBool, AtomicUsize, Ordering},
|
||||||
Arc,
|
Arc,
|
||||||
};
|
};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
|
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use dashmap::DashMap;
|
use dashmap::DashMap;
|
||||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||||
use once_cell::sync::Lazy;
|
|
||||||
use tokio::{fs, sync::Semaphore, time::sleep};
|
use tokio::{fs, sync::Semaphore, time::sleep};
|
||||||
use tracing::Instrument;
|
use tracing::Instrument;
|
||||||
|
|
||||||
@ -47,14 +46,8 @@ static KEYWORDS: &[&str] = &[
|
|||||||
"物联网",
|
"物联网",
|
||||||
"机器学习",
|
"机器学习",
|
||||||
];
|
];
|
||||||
static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| {
|
|
||||||
AhoCorasick::builder()
|
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
|
||||||
.kind(Some(AhoCorasickKind::DFA))
|
|
||||||
.match_kind(MatchKind::Standard)
|
|
||||||
.prefilter(true)
|
|
||||||
.build(KEYWORDS.iter())
|
|
||||||
.unwrap()
|
|
||||||
});
|
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
#[derive(Parser, Debug)]
|
||||||
#[command(version, about, long_about = None)]
|
#[command(version, about, long_about = None)]
|
||||||
@ -116,20 +109,16 @@ async fn main() -> io::Result<()> {
|
|||||||
tracing::info!("Start to read file");
|
tracing::info!("Start to read file");
|
||||||
let buf = fs::read(&task.file).await?;
|
let buf = fs::read(&task.file).await?;
|
||||||
let len = buf.len();
|
let len = buf.len();
|
||||||
|
let content = String::from_utf8_lossy(&buf);
|
||||||
|
|
||||||
tracing::debug!("Start to analyze");
|
tracing::debug!("Start to analyze");
|
||||||
let mut result = HashMap::new();
|
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
|
||||||
for mat in ANALYZER.find_iter(&buf) {
|
for (word, count) in result.iter() {
|
||||||
let word = str::from_utf8(&buf[mat.range()]).unwrap();
|
tracing::trace!(word = %word, count = %count, "Analyzed");
|
||||||
tracing::trace!(word = %word, "Matched");
|
|
||||||
result
|
|
||||||
.entry(word.to_string())
|
|
||||||
.and_modify(|e| *e += 1)
|
|
||||||
.or_insert(1);
|
|
||||||
analysis
|
analysis
|
||||||
.entry(word.to_string())
|
.entry(word.to_string())
|
||||||
.and_modify(|e| *e += 1)
|
.and_modify(|e| *e += count)
|
||||||
.or_insert(1);
|
.or_insert(*count);
|
||||||
}
|
}
|
||||||
tracing::debug!("Finished analysis");
|
tracing::debug!("Finished analysis");
|
||||||
|
|
||||||
@ -188,3 +177,87 @@ struct Task {
|
|||||||
file: PathBuf,
|
file: PathBuf,
|
||||||
size: usize,
|
size: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Multi-keyword counter backed by a hand-written Aho–Corasick automaton.
///
/// The automaton is a trie over `char`s whose nodes carry a suffix (failure)
/// link and the byte lengths of every keyword that ends at that node, either
/// directly or through the suffix chain.
///
/// Nodes are shared through `Rc<RefCell<_>>`, so a value of this type is
/// neither `Send` nor `Sync` and must stay on the thread that built it.
#[derive(Default)]
pub struct AhoCorasick {
    /// Trie root; its suffix link is never set and therefore never upgrades.
    root: Rc<RefCell<ACNode>>,
}

impl AhoCorasick {
    /// Builds an automaton that recognises every entry of `words`.
    ///
    /// Empty keywords are ignored (they would otherwise register a
    /// zero-length match at every position), and a keyword listed more than
    /// once is registered only once so that it is not double-counted.
    pub fn new(words: &[&str]) -> Self {
        let root = Rc::new(RefCell::new(ACNode::default()));
        for word in words {
            if word.is_empty() {
                continue;
            }
            // Walk/extend the trie one character at a time.
            let mut cur = Rc::clone(&root);
            for c in word.chars() {
                let next = Rc::clone(cur.borrow_mut().trans.entry(c).or_default());
                cur = next;
            }
            // At this point a node only holds the length of the keyword that
            // ends exactly here, so a `contains` hit means "already inserted".
            let mut terminal = cur.borrow_mut();
            if !terminal.lengths.contains(&word.len()) {
                terminal.lengths.push(word.len());
            }
        }
        Self::build_suffix(Rc::clone(&root));
        Self { root }
    }

    /// Computes suffix links breadth-first and merges inherited match lengths.
    ///
    /// Breadth-first order guarantees that the (strictly shallower) suffix
    /// target of a node already has its complete `lengths` list when the node
    /// copies it.
    fn build_suffix(root: Rc<RefCell<ACNode>>) {
        let mut queue = VecDeque::new();
        queue.push_back(Rc::clone(&root));
        while let Some(parent_rc) = queue.pop_front() {
            let parent = parent_rc.borrow();
            for (c, child) in &parent.trans {
                queue.push_back(Rc::clone(child));

                // Follow the parent's suffix chain until some node has a
                // transition on `c`; fall back to the root if none does.
                let mut fallback = parent.suffix.upgrade();
                let target = loop {
                    match fallback.take() {
                        None => break Rc::clone(&root),
                        Some(node) => {
                            let hit = node.borrow().trans.get(c).cloned();
                            if let Some(found) = hit {
                                break found;
                            }
                            fallback = node.borrow().suffix.upgrade();
                        }
                    }
                };

                let mut child_node = child.borrow_mut();
                child_node
                    .lengths
                    .extend_from_slice(&target.borrow().lengths);
                child_node.suffix = Rc::downgrade(&target);
            }
        }
    }

    /// Counts every (possibly overlapping) keyword occurrence in `s`.
    ///
    /// The returned map is keyed by the matched slice of `s` itself, so the
    /// keys borrow from the input rather than from the automaton.
    pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
        let mut ans = HashMap::new();
        let mut cur = Rc::clone(&self.root);
        // Byte offset just past the character most recently consumed.
        let mut position: usize = 0;
        for c in s.chars() {
            // Advance on `c`, retreating along suffix links while the current
            // node has no such transition; stop at the root.
            loop {
                let next = cur.borrow().trans.get(&c).cloned();
                if let Some(child) = next {
                    cur = child;
                    break;
                }
                let fallback = cur.borrow().suffix.upgrade();
                match fallback {
                    Some(node) => cur = node,
                    None => break,
                }
            }
            position += c.len_utf8();
            // Every recorded length is a keyword ending at `position`.
            for &len in &cur.borrow().lengths {
                *ans.entry(&s[position - len..position]).or_insert(0) += 1;
            }
        }
        ans
    }
}

/// A single trie node of the automaton.
#[derive(Default)]
struct ACNode {
    /// Outgoing trie edges, keyed by the next character.
    trans: BTreeMap<char, Rc<RefCell<ACNode>>>,
    /// Suffix (failure) link; weak so that links never form `Rc` cycles.
    suffix: Weak<RefCell<ACNode>>,
    /// Byte lengths of all keywords that end at this node.
    lengths: Vec<usize>,
}
|
||||||
|
Loading…
Reference in New Issue
Block a user