Initial commit
Signed-off-by: hr567 <hr567@hr567.me>
This commit is contained in:
commit
a71087840e
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/target
|
1052
Cargo.lock
generated
Normal file
1052
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
29
Cargo.toml
Normal file
29
Cargo.toml
Normal file
@@ -0,0 +1,29 @@
|
||||
[package]
|
||||
name = "default"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
clap = { version = "4.5.1", features = ["env", "derive"] }
|
||||
futures = "0.3.30"
|
||||
indicatif = "0.17.8"
|
||||
once_cell = "1.19.0"
|
||||
rand = "0.8.5"
|
||||
rand_distr = "0.4.3"
|
||||
serde = { version = "1.0.197", features = ["derive"] }
|
||||
serde_json = "1.0.114"
|
||||
tokio = { version = "1.36.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.14", features = ["full"] }
|
||||
tracing = "0.1.40"
|
||||
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
panic = "abort"
|
||||
codegen-units = 1
|
||||
debug = true
|
||||
split-debuginfo = "packed"
|
||||
|
||||
[build-dependencies]
|
236
src/main.rs
Normal file
236
src/main.rs
Normal file
@@ -0,0 +1,236 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
use std::fmt::Write;
|
||||
use std::io;
|
||||
use std::os::unix::prelude::*;
|
||||
use std::path::PathBuf;
|
||||
use std::rc::{Rc, Weak};
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
use std::time::Duration;
|
||||
|
||||
use clap::Parser;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use tokio::{fs, io::AsyncReadExt, sync::Semaphore, time::sleep};
|
||||
|
||||
static KEYWORDS: &[&str] = &[
|
||||
"数字经济",
|
||||
"数字媒体",
|
||||
"数字化",
|
||||
"数字资产",
|
||||
"数字化转型",
|
||||
"数据管理",
|
||||
"数据挖掘",
|
||||
"数据网络",
|
||||
"数据平台",
|
||||
"数据中心",
|
||||
"数据科学",
|
||||
"数字控制",
|
||||
"数字技术",
|
||||
"数字通信",
|
||||
"数字网络",
|
||||
"数字智能",
|
||||
"数字终端",
|
||||
"数字营销",
|
||||
"大数据",
|
||||
"云计算",
|
||||
"云IT",
|
||||
"云生态",
|
||||
"云服务",
|
||||
"云平台",
|
||||
"区块链",
|
||||
"物联网",
|
||||
"机器学习",
|
||||
];
|
||||
thread_local! {
|
||||
static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS);
|
||||
}
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about, long_about = None)]
|
||||
struct Args {
|
||||
#[arg(short, long)]
|
||||
directory: PathBuf,
|
||||
#[arg(short, long, default_value = "16")]
|
||||
jobs: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> io::Result<()> {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let args = Args::parse();
|
||||
println!("{}", args.directory.display());
|
||||
|
||||
let all_tasks = {
|
||||
let mut res = Vec::new();
|
||||
let mut files = fs::read_dir(&args.directory).await?;
|
||||
while let Some(entry) = files.next_entry().await? {
|
||||
if entry.path().extension().map_or(false, |ext| ext == "txt") {
|
||||
tracing::debug!(
|
||||
"Text file {} has been added to task list",
|
||||
entry.path().display()
|
||||
);
|
||||
let metadata = entry.metadata().await?;
|
||||
let task = Task {
|
||||
file: entry.path(),
|
||||
size: metadata.size() as usize,
|
||||
};
|
||||
res.push(task);
|
||||
}
|
||||
}
|
||||
res
|
||||
};
|
||||
|
||||
let permits = Arc::new(Semaphore::new(args.jobs));
|
||||
let total_size: usize = all_tasks.iter().map(|t| t.size).sum();
|
||||
let analyzed_size = Arc::new(AtomicUsize::new(0));
|
||||
tracing::info!("A total of {} bytes of text to analyze", total_size);
|
||||
let mut tasks = Vec::new();
|
||||
for task in all_tasks.into_iter() {
|
||||
let analyzed_size = analyzed_size.clone();
|
||||
let permits = permits.clone();
|
||||
let task = tokio::spawn(async move {
|
||||
let _ = permits.acquire().await.unwrap();
|
||||
let mut buf = String::new();
|
||||
let len = fs::File::open(&task.file)
|
||||
.await?
|
||||
.read_to_string(&mut buf)
|
||||
.await?;
|
||||
let result = ANALYZER.with(|analyzer| analyzer.analyze(&buf));
|
||||
tracing::debug_span!("after analyzing").in_scope(|| {
|
||||
tracing::debug!("Analysis of file {}", &task.file.display());
|
||||
for (word, count) in result.iter() {
|
||||
tracing::debug!("{}: {}", word, count);
|
||||
}
|
||||
});
|
||||
let mut target_file = task.file;
|
||||
target_file.set_extension("json");
|
||||
let json_result = serde_json::to_vec(&result).unwrap();
|
||||
fs::write(target_file, json_result).await?;
|
||||
analyzed_size.fetch_add(len, Ordering::Release);
|
||||
io::Result::Ok(())
|
||||
});
|
||||
tasks.push(task);
|
||||
}
|
||||
|
||||
let progress = tokio::spawn(async move {
|
||||
let process = ProgressBar::new(total_size as u64);
|
||||
const TEMPLATE : &str = "{spinner:.green} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})";
|
||||
let style = ProgressStyle::with_template(TEMPLATE)
|
||||
.unwrap()
|
||||
.with_key("eta", |state: &ProgressState, w: &mut dyn Write| {
|
||||
write!(w, "{:.1}s", state.eta().as_secs_f64()).unwrap()
|
||||
})
|
||||
.progress_chars("#>-");
|
||||
process.set_style(style);
|
||||
while analyzed_size.load(Ordering::Acquire) < total_size {
|
||||
process.set_position(analyzed_size.load(Ordering::Acquire) as u64);
|
||||
sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
process.finish_with_message("All files have been processed");
|
||||
});
|
||||
let results = futures::future::join_all(tasks).await;
|
||||
if let Err(e) = progress.await {
|
||||
tracing::error!("failed to process all tasks: {}", e);
|
||||
}
|
||||
for res in results.into_iter().map(|r| r.unwrap()) {
|
||||
if let Err(e) = res {
|
||||
tracing::error!("failed to process a task: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct Task {
|
||||
file: PathBuf,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AhoCorasick {
|
||||
root: Rc<RefCell<ACNode>>,
|
||||
}
|
||||
|
||||
impl AhoCorasick {
|
||||
pub fn new(words: &[&str]) -> Self {
|
||||
let root = Rc::new(RefCell::new(ACNode::default()));
|
||||
for word in words {
|
||||
let mut cur = Rc::clone(&root);
|
||||
for c in word.chars() {
|
||||
cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default());
|
||||
}
|
||||
cur.borrow_mut().lengths.push(word.len());
|
||||
}
|
||||
Self::build_suffix(Rc::clone(&root));
|
||||
Self { root }
|
||||
}
|
||||
|
||||
fn build_suffix(root: Rc<RefCell<ACNode>>) {
|
||||
let mut q = VecDeque::new();
|
||||
q.push_back(Rc::clone(&root));
|
||||
while let Some(parent) = q.pop_front() {
|
||||
let parent = parent.borrow();
|
||||
for (c, child) in &parent.trans {
|
||||
q.push_back(Rc::clone(child));
|
||||
let mut child = child.borrow_mut();
|
||||
let mut suffix = parent.suffix.upgrade();
|
||||
loop {
|
||||
match &suffix {
|
||||
None => {
|
||||
child.lengths.extend(root.borrow().lengths.clone());
|
||||
child.suffix = Rc::downgrade(&root);
|
||||
break;
|
||||
}
|
||||
Some(node) => {
|
||||
if node.borrow().trans.contains_key(c) {
|
||||
let node = &node.borrow().trans[c];
|
||||
child.lengths.extend(node.borrow().lengths.clone());
|
||||
child.suffix = Rc::downgrade(node);
|
||||
break;
|
||||
} else {
|
||||
suffix = suffix.unwrap().borrow().suffix.upgrade();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
|
||||
let mut ans = HashMap::new();
|
||||
let mut cur = Rc::clone(&self.root);
|
||||
let mut position: usize = 0;
|
||||
for c in s.chars() {
|
||||
loop {
|
||||
if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) {
|
||||
cur = Rc::clone(child);
|
||||
break;
|
||||
}
|
||||
let suffix = cur.borrow().suffix.clone();
|
||||
match suffix.upgrade() {
|
||||
Some(node) => cur = node,
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
position += c.len_utf8();
|
||||
for &len in &cur.borrow().lengths {
|
||||
ans.entry(&s[position - len..position])
|
||||
.and_modify(|e| *e += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
}
|
||||
ans
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct ACNode {
|
||||
trans: BTreeMap<char, Rc<RefCell<ACNode>>>,
|
||||
suffix: Weak<RefCell<ACNode>>,
|
||||
lengths: Vec<usize>,
|
||||
}
|
Loading…
Reference in New Issue
Block a user