Add Jieba segmentation for word count
Signed-off-by: hr567 <hr567@hr567.me>
This commit is contained in:
18
src/main.rs
18
src/main.rs
@ -3,7 +3,7 @@ use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
use std::env::current_dir;
|
||||
use std::fmt::Write;
|
||||
use std::fs::read_to_string;
|
||||
use std::io;
|
||||
use std::io::{self, Cursor};
|
||||
use std::path::PathBuf;
|
||||
use std::rc::{Rc, Weak};
|
||||
use std::sync::{
|
||||
@ -15,6 +15,7 @@ use std::time::Duration;
|
||||
use clap::Parser;
|
||||
use dashmap::DashMap;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use jieba_rs::Jieba;
|
||||
use once_cell::{sync::Lazy, unsync::OnceCell};
|
||||
use tokio::{fs, sync::Semaphore, time::sleep};
|
||||
use tracing::Instrument;
|
||||
@ -40,6 +41,10 @@ static ARGS: Lazy<Args> = Lazy::new(|| {
|
||||
}
|
||||
args
|
||||
});
|
||||
/// Process-wide Jieba segmenter, constructed lazily on first access.
///
/// The dictionary is embedded into the binary at compile time from
/// `../dict.txt.big` and read through an in-memory `Cursor`, so no
/// dictionary file is opened at runtime.
static SEGMENTATION: Lazy<Jieba> = Lazy::new(|| {
|
||||
// Raw dictionary bytes baked into the executable by `include_bytes!`.
const DICT_CONTENT: &[u8] = include_bytes!("../dict.txt.big");
|
||||
// `with_dict` needs a buffered reader, hence the `Cursor`; if the embedded
// dictionary cannot be loaded, the first access panics with this message.
Jieba::with_dict(&mut Cursor::new(DICT_CONTENT)).expect("failed to open jieba with dict")
|
||||
});
|
||||
// Per-thread slot for the keyword analyzer. The cell starts empty and is
// filled on first use through `get_or_init`, which builds an `AhoCorasick`
// from `ARGS.keywords`.
thread_local! {
|
||||
// `unsync::OnceCell` suffices here because each thread owns its own copy.
static ANALYZER: OnceCell<AhoCorasick> = const { OnceCell::new() };
|
||||
}
|
||||
@ -98,10 +103,13 @@ async fn main() -> io::Result<()> {
|
||||
let content = String::from_utf8_lossy(&buf);
|
||||
|
||||
tracing::debug!("Start to analyze");
|
||||
let words_cnt = SEGMENTATION.cut(&content, true).len();
|
||||
let result = ANALYZER.with(|analyzer| {
|
||||
analyzer
|
||||
let mut res = analyzer
|
||||
.get_or_init(|| AhoCorasick::new(&ARGS.keywords))
|
||||
.analyze(&content)
|
||||
.analyze(&content);
|
||||
res.insert("_total", words_cnt);
|
||||
res
|
||||
});
|
||||
for (word, count) in result.iter() {
|
||||
tracing::trace!(word = %word, count = %count, "Analyzed");
|
||||
@ -110,13 +118,13 @@ async fn main() -> io::Result<()> {
|
||||
.and_modify(|e| *e += count)
|
||||
.or_insert(*count);
|
||||
}
|
||||
tracing::debug!("Finished analysis");
|
||||
|
||||
tracing::info!("Write result to file");
|
||||
let json_result = serde_json::to_vec(&result).unwrap();
|
||||
fs::write(task.file.with_extension("json"), json_result).await?;
|
||||
tracing::info!("Write result to file");
|
||||
|
||||
analyzed_size.fetch_add(len, Ordering::Release);
|
||||
tracing::debug!("Finished analysis");
|
||||
|
||||
io::Result::Ok(())
|
||||
}
|
||||
|
Reference in New Issue
Block a user