Add keywords and keywords_file to arguments
Signed-off-by: hr567 <hr567@hr567.me>
This commit is contained in:
62
src/main.rs
62
src/main.rs
@ -14,41 +14,10 @@ use std::time::Duration;
|
||||
use clap::Parser;
|
||||
use dashmap::DashMap;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use once_cell::{sync::Lazy, unsync::OnceCell};
|
||||
use tokio::{fs, sync::Semaphore, time::sleep};
|
||||
use tracing::Instrument;
|
||||
|
||||
static KEYWORDS: &[&str] = &[
|
||||
"数字经济",
|
||||
"数字媒体",
|
||||
"数字化",
|
||||
"数字资产",
|
||||
"数字化转型",
|
||||
"数据管理",
|
||||
"数据挖掘",
|
||||
"数据网络",
|
||||
"数据平台",
|
||||
"数据中心",
|
||||
"数据科学",
|
||||
"数字控制",
|
||||
"数字技术",
|
||||
"数字通信",
|
||||
"数字网络",
|
||||
"数字智能",
|
||||
"数字终端",
|
||||
"数字营销",
|
||||
"大数据",
|
||||
"云计算",
|
||||
"云IT",
|
||||
"云生态",
|
||||
"云服务",
|
||||
"云平台",
|
||||
"区块链",
|
||||
"物联网",
|
||||
"机器学习",
|
||||
];
|
||||
|
||||
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about, long_about = None)]
|
||||
struct Args {
|
||||
@ -56,22 +25,31 @@ struct Args {
|
||||
directory: PathBuf,
|
||||
#[arg(short, long, default_value = "16")]
|
||||
jobs: usize,
|
||||
#[arg(short = 'f', long)]
|
||||
keywords_file: Option<PathBuf>,
|
||||
#[arg(last = true)]
|
||||
keywords: Vec<String>,
|
||||
}
|
||||
|
||||
static ARGS: Lazy<Args> = Lazy::new(Args::parse);
|
||||
thread_local! {
|
||||
static ANALYZER: OnceCell<AhoCorasick> = const { OnceCell::new() };
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> io::Result<()> {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let args = Args::parse();
|
||||
tracing::debug!(
|
||||
"start to analyze {} with {} jobs",
|
||||
args.directory.display(),
|
||||
args.jobs
|
||||
ARGS.directory.display(),
|
||||
ARGS.jobs
|
||||
);
|
||||
tracing::debug!("current keywords are {:?}", &ARGS.keywords);
|
||||
|
||||
let all_tasks = {
|
||||
let mut res = Vec::new();
|
||||
let mut files = fs::read_dir(&args.directory).await?;
|
||||
let mut files = fs::read_dir(&ARGS.directory).await?;
|
||||
while let Some(entry) = files.next_entry().await? {
|
||||
if entry.path().extension().map_or(true, |ext| ext != "txt") {
|
||||
continue;
|
||||
@ -91,7 +69,7 @@ async fn main() -> io::Result<()> {
|
||||
};
|
||||
|
||||
let analysis = Arc::new(DashMap::new());
|
||||
let permits = Arc::new(Semaphore::new(args.jobs));
|
||||
let permits = Arc::new(Semaphore::new(ARGS.jobs));
|
||||
let total_size: usize = all_tasks.iter().map(|t| t.size).sum();
|
||||
let analyzed_size = Arc::new(AtomicUsize::new(0));
|
||||
tracing::info!("A total of {} bytes of text to analyze", total_size);
|
||||
@ -112,7 +90,11 @@ async fn main() -> io::Result<()> {
|
||||
let content = String::from_utf8_lossy(&buf);
|
||||
|
||||
tracing::debug!("Start to analyze");
|
||||
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content));
|
||||
let result = ANALYZER.with(|analyzer| {
|
||||
analyzer
|
||||
.get_or_init(|| AhoCorasick::new(&ARGS.keywords))
|
||||
.analyze(&content)
|
||||
});
|
||||
for (word, count) in result.iter() {
|
||||
tracing::trace!(word = %word, count = %count, "Analyzed");
|
||||
analysis
|
||||
@ -184,9 +166,9 @@ pub struct AhoCorasick {
|
||||
}
|
||||
|
||||
impl AhoCorasick {
|
||||
pub fn new(words: &[&str]) -> Self {
|
||||
pub fn new<S: AsRef<str>>(words: &[S]) -> Self {
|
||||
let root = Rc::new(RefCell::new(ACNode::default()));
|
||||
for word in words {
|
||||
for word in words.iter().map(|s| s.as_ref()) {
|
||||
let mut cur = Rc::clone(&root);
|
||||
for c in word.chars() {
|
||||
cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default());
|
||||
|
Reference in New Issue
Block a user