Add keywords and keywords_file to arguments

Signed-off-by: hr567 <hr567@hr567.me>
This commit is contained in:
2024-06-03 15:35:23 +08:00
parent f322ae675a
commit 733bef283e
5 changed files with 201 additions and 155 deletions

View File

@ -14,41 +14,10 @@ use std::time::Duration;
use clap::Parser;
use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use once_cell::{sync::Lazy, unsync::OnceCell};
use tokio::{fs, sync::Semaphore, time::sleep};
use tracing::Instrument;
/// Built-in list of keywords that is compiled into the binary.
///
/// The entries are Chinese terms from the digital-economy vocabulary
/// (digital/data/cloud terms, plus big data, blockchain, Internet of Things
/// and machine learning). The slice is handed to `AhoCorasick::new` to build
/// the thread-local `ANALYZER`.
static KEYWORDS: &[&str] = &[
"数字经济",
"数字媒体",
"数字化",
"数字资产",
"数字化转型",
"数据管理",
"数据挖掘",
"数据网络",
"数据平台",
"数据中心",
"数据科学",
"数字控制",
"数字技术",
"数字通信",
"数字网络",
"数字智能",
"数字终端",
"数字营销",
"大数据",
"云计算",
"云IT",
"云生态",
"云服务",
"云平台",
"区块链",
"物联网",
"机器学习",
];
// Per-thread keyword matcher: every thread that touches `ANALYZER` gets its own
// `AhoCorasick` instance, constructed from the static `KEYWORDS` list.
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
@ -56,22 +25,31 @@ struct Args {
directory: PathBuf,
#[arg(short, long, default_value = "16")]
jobs: usize,
#[arg(short = 'f', long)]
keywords_file: Option<PathBuf>,
#[arg(last = true)]
keywords: Vec<String>,
}
/// Process-wide command-line arguments.
///
/// Wrapped in `Lazy`, so `Args::parse` runs once, on the first dereference of
/// `ARGS`, and the parsed value is then readable from anywhere in the program
/// without being passed around.
static ARGS: Lazy<Args> = Lazy::new(Args::parse);
// Per-thread slot for the keyword matcher. The cell starts out empty; the
// analysis code fills it on first use with
// `get_or_init(|| AhoCorasick::new(&ARGS.keywords))`, so each thread builds its
// own automaton from the keywords given on the command line.
// NOTE(review): presumably thread-local because `AhoCorasick` is built from
// `Rc<RefCell<..>>` nodes and so cannot be shared across threads — confirm.
thread_local! {
static ANALYZER: OnceCell<AhoCorasick> = const { OnceCell::new() };
}
#[tokio::main]
async fn main() -> io::Result<()> {
tracing_subscriber::fmt::init();
let args = Args::parse();
tracing::debug!(
"start to analyze {} with {} jobs",
args.directory.display(),
args.jobs
ARGS.directory.display(),
ARGS.jobs
);
tracing::debug!("current keywords are {:?}", &ARGS.keywords);
let all_tasks = {
let mut res = Vec::new();
let mut files = fs::read_dir(&args.directory).await?;
let mut files = fs::read_dir(&ARGS.directory).await?;
while let Some(entry) = files.next_entry().await? {
if entry.path().extension().map_or(true, |ext| ext != "txt") {
continue;
@ -91,7 +69,7 @@ async fn main() -> io::Result<()> {
};
let analysis = Arc::new(DashMap::new());
let permits = Arc::new(Semaphore::new(args.jobs));
let permits = Arc::new(Semaphore::new(ARGS.jobs));
let total_size: usize = all_tasks.iter().map(|t| t.size).sum();
let analyzed_size = Arc::new(AtomicUsize::new(0));
tracing::info!("A total of {} bytes of text to analyze", total_size);
@ -112,7 +90,11 @@ async fn main() -> io::Result<()> {
let content = String::from_utf8_lossy(&buf);
tracing::debug!("Start to analyze");
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
let result = ANALYZER.with(|analyzer| {
analyzer
.get_or_init(|| AhoCorasick::new(&ARGS.keywords))
.analyze(&content)
});
for (word, count) in result.iter() {
tracing::trace!(word = %word, count = %count, "Analyzed");
analysis
@ -184,9 +166,9 @@ pub struct AhoCorasick {
}
impl AhoCorasick {
pub fn new(words: &[&str]) -> Self {
pub fn new<S: AsRef<str>>(words: &[S]) -> Self {
let root = Rc::new(RefCell::new(ACNode::default()));
for word in words {
for word in words.iter().map(|s| s.as_ref()) {
let mut cur = Rc::clone(&root);
for c in word.chars() {
cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default());