Add Jieba segmentation for word counting

Signed-off-by: hr567 <hr567@hr567.me>
This commit is contained in:
2024-06-11 16:25:32 +08:00
parent 7e8f4f8a00
commit a536e33580
4 changed files with 584694 additions and 139 deletions

View File

@ -3,7 +3,7 @@ use std::collections::{BTreeMap, HashMap, VecDeque};
use std::env::current_dir;
use std::fmt::Write;
use std::fs::read_to_string;
use std::io;
use std::io::{self, Cursor};
use std::path::PathBuf;
use std::rc::{Rc, Weak};
use std::sync::{
@ -15,6 +15,7 @@ use std::time::Duration;
use clap::Parser;
use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use jieba_rs::Jieba;
use once_cell::{sync::Lazy, unsync::OnceCell};
use tokio::{fs, sync::Semaphore, time::sleep};
use tracing::Instrument;
@ -40,6 +41,10 @@ static ARGS: Lazy<Args> = Lazy::new(|| {
}
args
});
/// Shared Jieba segmenter, constructed lazily the first time it is dereferenced.
///
/// The dictionary is embedded into the binary at compile time via `include_bytes!`,
/// so no dictionary file is read from disk at runtime.
///
/// # Panics
///
/// Panics on first access if `Jieba::with_dict` rejects the embedded dictionary.
static SEGMENTATION: Lazy<Jieba> = Lazy::new(|| {
    const EMBEDDED_DICT: &[u8] = include_bytes!("../dict.txt.big");

    // `with_dict` takes a mutable reader, so wrap the static bytes in a cursor first.
    let mut dict_reader = Cursor::new(EMBEDDED_DICT);
    Jieba::with_dict(&mut dict_reader).expect("failed to open jieba with dict")
});
// Per-thread slot holding the keyword analyzer. The cell is const-initialized empty
// and is populated on first use through `get_or_init`, which builds an `AhoCorasick`
// from `ARGS.keywords`.
// NOTE(review): presumably kept thread-local because the `unsync` flavour of
// `OnceCell` is not meant to be shared across threads — confirm.
thread_local! {
static ANALYZER: OnceCell<AhoCorasick> = const { OnceCell::new() };
}
@ -98,10 +103,13 @@ async fn main() -> io::Result<()> {
let content = String::from_utf8_lossy(&buf);
tracing::debug!("Start to analyze");
let words_cnt = SEGMENTATION.cut(&content, true).len();
let result = ANALYZER.with(|analyzer| {
analyzer
let mut res = analyzer
.get_or_init(|| AhoCorasick::new(&ARGS.keywords))
.analyze(&content)
.analyze(&content);
res.insert("_total", words_cnt);
res
});
for (word, count) in result.iter() {
tracing::trace!(word = %word, count = %count, "Analyzed");
@ -110,13 +118,13 @@ async fn main() -> io::Result<()> {
.and_modify(|e| *e += count)
.or_insert(*count);
}
tracing::debug!("Finished analysis");
tracing::info!("Write result to file");
let json_result = serde_json::to_vec(&result).unwrap();
fs::write(task.file.with_extension("json"), json_result).await?;
tracing::info!("Write result to file");
analyzed_size.fetch_add(len, Ordering::Release);
tracing::debug!("Finished analysis");
io::Result::Ok(())
}