Compare commits

...

6 Commits

Author SHA1 Message Date
a536e33580
Add Jieba segmentation for words count
Signed-off-by: hr567 <hr567@hr567.me>
2024-06-11 16:25:37 +08:00
7e8f4f8a00
Use typed arguments for default values
Signed-off-by: hr567 <hr567@hr567.me>
2024-06-04 14:45:50 +08:00
72574323ba
Add default value for analysis dir
Signed-off-by: hr567 <hr567@hr567.me>
2024-06-04 11:17:25 +08:00
caafa8f715
Implement keywords_file option
Signed-off-by: hr567 <hr567@hr567.me>
2024-06-03 17:10:38 +08:00
1727bd8e6e
Use metadata.len() istead of metadata.size() for other platforms
Signed-off-by: hr567 <hr567@hr567.me>
2024-06-03 16:00:09 +08:00
733bef283e
Add keywords and keywords_file to arguments
Signed-off-by: hr567 <hr567@hr567.me>
2024-06-03 15:35:23 +08:00
6 changed files with 584881 additions and 272 deletions

567
Cargo.lock generated
View File

@ -4,9 +4,9 @@ version = 3
[[package]] [[package]]
name = "addr2line" name = "addr2line"
version = "0.21.0" version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
dependencies = [ dependencies = [
"gimli", "gimli",
] ]
@ -18,64 +18,74 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]] [[package]]
name = "anstream" name = "aho-corasick"
version = "0.6.13" version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
dependencies = [ dependencies = [
"anstyle", "anstyle",
"anstyle-parse", "anstyle-parse",
"anstyle-query", "anstyle-query",
"anstyle-wincon", "anstyle-wincon",
"colorchoice", "colorchoice",
"is_terminal_polyfill",
"utf8parse", "utf8parse",
] ]
[[package]] [[package]]
name = "anstyle" name = "anstyle"
version = "1.0.6" version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
[[package]] [[package]]
name = "anstyle-parse" name = "anstyle-parse"
version = "0.2.3" version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
dependencies = [ dependencies = [
"utf8parse", "utf8parse",
] ]
[[package]] [[package]]
name = "anstyle-query" name = "anstyle-query"
version = "1.0.2" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
dependencies = [ dependencies = [
"windows-sys 0.52.0", "windows-sys",
] ]
[[package]] [[package]]
name = "anstyle-wincon" name = "anstyle-wincon"
version = "3.0.2" version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
dependencies = [ dependencies = [
"anstyle", "anstyle",
"windows-sys 0.52.0", "windows-sys",
] ]
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "1.1.0" version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
[[package]] [[package]]
name = "backtrace" name = "backtrace"
version = "0.3.69" version = "0.3.72"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11"
dependencies = [ dependencies = [
"addr2line", "addr2line",
"cc", "cc",
@ -88,21 +98,36 @@ dependencies = [
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]] [[package]]
name = "bytes" name = "bytes"
version = "1.5.0" version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.90" version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695"
[[package]]
name = "cedarwood"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
dependencies = [
"smallvec",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
@ -112,9 +137,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.2" version = "4.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
dependencies = [ dependencies = [
"clap_builder", "clap_builder",
"clap_derive", "clap_derive",
@ -122,9 +147,9 @@ dependencies = [
[[package]] [[package]]
name = "clap_builder" name = "clap_builder"
version = "4.5.2" version = "4.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f"
dependencies = [ dependencies = [
"anstream", "anstream",
"anstyle", "anstyle",
@ -134,9 +159,9 @@ dependencies = [
[[package]] [[package]]
name = "clap_derive" name = "clap_derive"
version = "4.5.0" version = "4.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
dependencies = [ dependencies = [
"heck", "heck",
"proc-macro2", "proc-macro2",
@ -146,15 +171,15 @@ dependencies = [
[[package]] [[package]]
name = "clap_lex" name = "clap_lex"
version = "0.7.0" version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
[[package]] [[package]]
name = "colorchoice" name = "colorchoice"
version = "1.0.0" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
[[package]] [[package]]
name = "console" name = "console"
@ -166,7 +191,42 @@ dependencies = [
"lazy_static", "lazy_static",
"libc", "libc",
"unicode-width", "unicode-width",
"windows-sys 0.52.0", "windows-sys",
]
[[package]]
name = "darling"
version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178"
dependencies = [
"darling_core",
"quote",
"syn",
] ]
[[package]] [[package]]
@ -183,12 +243,49 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "derive_builder"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
dependencies = [
"derive_builder_macro",
]
[[package]]
name = "derive_builder_core"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "derive_builder_macro"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
dependencies = [
"derive_builder_core",
"syn",
]
[[package]] [[package]]
name = "encode_unicode" name = "encode_unicode"
version = "0.3.6" version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.30" version = "0.3.30"
@ -279,22 +376,31 @@ dependencies = [
] ]
[[package]] [[package]]
name = "gimli" name = "fxhash"
version = "0.28.1" version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "gimli"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.14.3" version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.4.1" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]] [[package]]
name = "hermit-abi" name = "hermit-abi"
@ -302,6 +408,12 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]] [[package]]
name = "indicatif" name = "indicatif"
version = "0.17.8" version = "0.17.8"
@ -317,18 +429,39 @@ dependencies = [
[[package]] [[package]]
name = "instant" name = "instant"
version = "0.1.12" version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]] [[package]]
name = "itoa" name = "is_terminal_polyfill"
version = "1.0.10" version = "1.70.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
[[package]]
name = "itoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jieba-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1e2b0210dc78b49337af9e49d7ae41a39dceac6e5985613f1cf7763e2f76a25"
dependencies = [
"cedarwood",
"derive_builder",
"fxhash",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]] [[package]]
name = "lazy_static" name = "lazy_static"
@ -338,15 +471,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.153" version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.11" version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"scopeguard", "scopeguard",
@ -360,30 +493,19 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.1" version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
[[package]] [[package]]
name = "miniz_oxide" name = "miniz_oxide"
version = "0.7.2" version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
dependencies = [ dependencies = [
"adler", "adler",
] ]
[[package]]
name = "mio"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [
"libc",
"wasi",
"windows-sys 0.48.0",
]
[[package]] [[package]]
name = "nu-ansi-term" name = "nu-ansi-term"
version = "0.46.0" version = "0.46.0"
@ -412,9 +534,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]] [[package]]
name = "object" name = "object"
version = "0.32.2" version = "0.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]
@ -433,9 +555,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.12.1" version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
dependencies = [ dependencies = [
"lock_api", "lock_api",
"parking_lot_core", "parking_lot_core",
@ -443,22 +565,60 @@ dependencies = [
[[package]] [[package]]
name = "parking_lot_core" name = "parking_lot_core"
version = "0.9.9" version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"libc", "libc",
"redox_syscall", "redox_syscall",
"smallvec", "smallvec",
"windows-targets 0.48.5", "windows-targets",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
] ]
[[package]] [[package]]
name = "pin-project-lite" name = "pin-project-lite"
version = "0.2.13" version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
[[package]] [[package]]
name = "pin-utils" name = "pin-utils"
@ -474,42 +634,86 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.78" version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.35" version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]] [[package]]
name = "redox_syscall" name = "rand"
version = "0.4.1" version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "redox_syscall"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e"
dependencies = [ dependencies = [
"bitflags", "bitflags",
] ]
[[package]] [[package]]
name = "rustc-demangle" name = "regex"
version = "0.1.23" version = "1.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]] [[package]]
name = "ryu" name = "ryu"
version = "1.0.17" version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]] [[package]]
name = "scopeguard" name = "scopeguard"
@ -519,18 +723,18 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.197" version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
dependencies = [ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.197" version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -539,9 +743,9 @@ dependencies = [
[[package]] [[package]]
name = "serde_json" name = "serde_json"
version = "1.0.114" version = "1.0.117"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3"
dependencies = [ dependencies = [
"itoa", "itoa",
"ryu", "ryu",
@ -558,13 +762,10 @@ dependencies = [
] ]
[[package]] [[package]]
name = "signal-hook-registry" name = "siphasher"
version = "1.4.1" version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "slab" name = "slab"
@ -577,31 +778,21 @@ dependencies = [
[[package]] [[package]]
name = "smallvec" name = "smallvec"
version = "1.13.1" version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "socket2"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.11.0" version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.52" version = "2.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -620,28 +811,23 @@ dependencies = [
[[package]] [[package]]
name = "tokio" name = "tokio"
version = "1.36.0" version = "1.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
dependencies = [ dependencies = [
"backtrace", "backtrace",
"bytes", "bytes",
"libc",
"mio",
"num_cpus", "num_cpus",
"parking_lot", "parking_lot",
"pin-project-lite", "pin-project-lite",
"signal-hook-registry",
"socket2",
"tokio-macros", "tokio-macros",
"windows-sys 0.48.0",
] ]
[[package]] [[package]]
name = "tokio-macros" name = "tokio-macros"
version = "2.2.0" version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -713,15 +899,15 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.11" version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
[[package]] [[package]]
name = "utf8parse" name = "utf8parse"
version = "0.2.1" version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]] [[package]]
name = "valuable" name = "valuable"
@ -729,12 +915,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]] [[package]]
name = "winapi" name = "winapi"
version = "0.3.9" version = "0.3.9"
@ -757,137 +937,78 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.5",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.52.0" version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [ dependencies = [
"windows-targets 0.52.4", "windows-targets",
] ]
[[package]] [[package]]
name = "windows-targets" name = "windows-targets"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
dependencies = [ dependencies = [
"windows_aarch64_gnullvm 0.48.5", "windows_aarch64_gnullvm",
"windows_aarch64_msvc 0.48.5", "windows_aarch64_msvc",
"windows_i686_gnu 0.48.5", "windows_i686_gnu",
"windows_i686_msvc 0.48.5", "windows_i686_gnullvm",
"windows_x86_64_gnu 0.48.5", "windows_i686_msvc",
"windows_x86_64_gnullvm 0.48.5", "windows_x86_64_gnu",
"windows_x86_64_msvc 0.48.5", "windows_x86_64_gnullvm",
] "windows_x86_64_msvc",
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
] ]
[[package]] [[package]]
name = "windows_aarch64_gnullvm" name = "windows_aarch64_gnullvm"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]] [[package]]
name = "windows_aarch64_msvc" name = "windows_aarch64_msvc"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]] [[package]]
name = "windows_i686_gnu" name = "windows_i686_gnu"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
[[package]] [[package]]
name = "windows_i686_gnu" name = "windows_i686_gnullvm"
version = "0.52.4" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
[[package]] [[package]]
name = "windows_i686_msvc" name = "windows_i686_msvc"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]] [[package]]
name = "windows_x86_64_gnu" name = "windows_x86_64_gnu"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]] [[package]]
name = "windows_x86_64_gnullvm" name = "windows_x86_64_gnullvm"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]] [[package]]
name = "windows_x86_64_msvc" name = "windows_x86_64_msvc"
version = "0.48.5" version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]] [[package]]
name = "word_freq_analyzer" name = "word_freq_analyzer"
@ -897,6 +1018,8 @@ dependencies = [
"dashmap", "dashmap",
"futures", "futures",
"indicatif", "indicatif",
"jieba-rs",
"once_cell",
"serde_json", "serde_json",
"tokio", "tokio",
"tracing", "tracing",

View File

@ -6,12 +6,23 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
clap = { version = "4.5.2", features = ["derive"] } clap = { version = "4.5.7", features = ["derive"] }
dashmap = { version = "5.5.3", features = ["serde"] } dashmap = { version = "5.5.3", features = ["serde"] }
futures = "0.3.30" futures = "0.3.30"
indicatif = "0.17.8" indicatif = "0.17.8"
serde_json = "1.0.114" jieba-rs = { version = "0.7.0", default-features = false }
tokio = { version = "1.36.0", features = ["full"] } once_cell = "1.19.0"
serde_json = "1.0.117"
tokio = { version = "1.38.0", features = [
"rt",
"rt-multi-thread",
"io-util",
"time",
"macros",
"sync",
"fs",
"parking_lot",
] }
tracing = "0.1.40" tracing = "0.1.40"
tracing-subscriber = "0.3.18" tracing-subscriber = "0.3.18"

584429
dict.txt.big Normal file

File diff suppressed because it is too large Load Diff

27
keywords/1.txt Normal file
View File

@ -0,0 +1,27 @@
数字经济
数字媒体
数字化
数字资产
数字化转型
数据管理
数据挖掘
数据网络
数据平台
数据中心
数据科学
数字控制
数字技术
数字通信
数字网络
数字智能
数字终端
数字营销
大数据
云计算
云IT
云生态
云服务
云平台
区块链
物联网
机器学习

21
keywords/2.txt Normal file
View File

@ -0,0 +1,21 @@
R&D
研发
创新
研究
开发
研制
科研
预研
设计
创造
实验
试验
技术
专利
工艺
新项目
新产品
新业务
知识产权
科技成果
科技投入

View File

@ -1,8 +1,9 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::collections::{BTreeMap, HashMap, VecDeque}; use std::collections::{BTreeMap, HashMap, VecDeque};
use std::env::current_dir;
use std::fmt::Write; use std::fmt::Write;
use std::io; use std::fs::read_to_string;
use std::os::unix::prelude::*; use std::io::{self, Cursor};
use std::path::PathBuf; use std::path::PathBuf;
use std::rc::{Rc, Weak}; use std::rc::{Rc, Weak};
use std::sync::{ use std::sync::{
@ -14,64 +15,54 @@ use std::time::Duration;
use clap::Parser; use clap::Parser;
use dashmap::DashMap; use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressState, ProgressStyle}; use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use jieba_rs::Jieba;
use once_cell::{sync::Lazy, unsync::OnceCell};
use tokio::{fs, sync::Semaphore, time::sleep}; use tokio::{fs, sync::Semaphore, time::sleep};
use tracing::Instrument; use tracing::Instrument;
static KEYWORDS: &[&str] = &[
"数字经济",
"数字媒体",
"数字化",
"数字资产",
"数字化转型",
"数据管理",
"数据挖掘",
"数据网络",
"数据平台",
"数据中心",
"数据科学",
"数字控制",
"数字技术",
"数字通信",
"数字网络",
"数字智能",
"数字终端",
"数字营销",
"大数据",
"云计算",
"云IT",
"云生态",
"云服务",
"云平台",
"区块链",
"物联网",
"机器学习",
];
thread_local! { static ANALYZER: AhoCorasick = AhoCorasick::new(KEYWORDS); }
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(version, about, long_about = None)] #[command(version, about, long_about = None)]
struct Args { struct Args {
#[arg(short, long)] #[arg(short, long, default_value_os_t = current_dir().unwrap())]
directory: PathBuf, directory: PathBuf,
#[arg(short, long, default_value = "16")] #[arg(short, long, default_value_t = 16)]
jobs: usize, jobs: usize,
#[arg(short = 'f', long, default_value_os_t = current_dir().unwrap().join("keywords.txt"))]
keywords_file: PathBuf,
#[arg(last = true)]
keywords: Vec<String>,
}
static ARGS: Lazy<Args> = Lazy::new(|| {
let mut args = Args::parse();
if args.keywords.is_empty() && args.keywords_file.exists() {
let content = read_to_string(&args.keywords_file).expect("failed to read keywords file");
args.keywords = content.split_whitespace().map(|s| s.to_string()).collect();
}
args
});
static SEGMENTATION: Lazy<Jieba> = Lazy::new(|| {
const DICT_CONTENT: &[u8] = include_bytes!("../dict.txt.big");
Jieba::with_dict(&mut Cursor::new(DICT_CONTENT)).expect("failed to open jieba with dict")
});
thread_local! {
static ANALYZER: OnceCell<AhoCorasick> = const { OnceCell::new() };
} }
#[tokio::main] #[tokio::main]
async fn main() -> io::Result<()> { async fn main() -> io::Result<()> {
tracing_subscriber::fmt::init(); tracing_subscriber::fmt::init();
let args = Args::parse();
tracing::debug!( tracing::debug!(
"start to analyze {} with {} jobs", "start to analyze {} with {} jobs",
args.directory.display(), ARGS.directory.display(),
args.jobs ARGS.jobs
); );
tracing::debug!("current keywords are {:?}", &ARGS.keywords);
let all_tasks = { let all_tasks = {
let mut res = Vec::new(); let mut res = Vec::new();
let mut files = fs::read_dir(&args.directory).await?; let mut files = fs::read_dir(&ARGS.directory).await?;
while let Some(entry) = files.next_entry().await? { while let Some(entry) = files.next_entry().await? {
if entry.path().extension().map_or(true, |ext| ext != "txt") { if entry.path().extension().map_or(true, |ext| ext != "txt") {
continue; continue;
@ -83,7 +74,7 @@ async fn main() -> io::Result<()> {
let metadata = entry.metadata().await?; let metadata = entry.metadata().await?;
let task = Task { let task = Task {
file: entry.path(), file: entry.path(),
size: metadata.size() as usize, size: metadata.len() as usize,
}; };
res.push(task); res.push(task);
} }
@ -91,7 +82,7 @@ async fn main() -> io::Result<()> {
}; };
let analysis = Arc::new(DashMap::new()); let analysis = Arc::new(DashMap::new());
let permits = Arc::new(Semaphore::new(args.jobs)); let permits = Arc::new(Semaphore::new(ARGS.jobs));
let total_size: usize = all_tasks.iter().map(|t| t.size).sum(); let total_size: usize = all_tasks.iter().map(|t| t.size).sum();
let analyzed_size = Arc::new(AtomicUsize::new(0)); let analyzed_size = Arc::new(AtomicUsize::new(0));
tracing::info!("A total of {} bytes of text to analyze", total_size); tracing::info!("A total of {} bytes of text to analyze", total_size);
@ -112,7 +103,14 @@ async fn main() -> io::Result<()> {
let content = String::from_utf8_lossy(&buf); let content = String::from_utf8_lossy(&buf);
tracing::debug!("Start to analyze"); tracing::debug!("Start to analyze");
let result = ANALYZER.with(|analyzer| analyzer.analyze(&content)); let words_cnt = SEGMENTATION.cut(&content, true).len();
let result = ANALYZER.with(|analyzer| {
let mut res = analyzer
.get_or_init(|| AhoCorasick::new(&ARGS.keywords))
.analyze(&content);
res.insert("_total", words_cnt);
res
});
for (word, count) in result.iter() { for (word, count) in result.iter() {
tracing::trace!(word = %word, count = %count, "Analyzed"); tracing::trace!(word = %word, count = %count, "Analyzed");
analysis analysis
@ -120,13 +118,13 @@ async fn main() -> io::Result<()> {
.and_modify(|e| *e += count) .and_modify(|e| *e += count)
.or_insert(*count); .or_insert(*count);
} }
tracing::debug!("Finished analysis");
tracing::info!("Write result to file");
let json_result = serde_json::to_vec(&result).unwrap(); let json_result = serde_json::to_vec(&result).unwrap();
fs::write(task.file.with_extension("json"), json_result).await?; fs::write(task.file.with_extension("json"), json_result).await?;
tracing::info!("Write result to file");
analyzed_size.fetch_add(len, Ordering::Release); analyzed_size.fetch_add(len, Ordering::Release);
tracing::debug!("Finished analysis");
io::Result::Ok(()) io::Result::Ok(())
} }
@ -184,9 +182,9 @@ pub struct AhoCorasick {
} }
impl AhoCorasick { impl AhoCorasick {
pub fn new(words: &[&str]) -> Self { pub fn new<S: AsRef<str>>(words: &[S]) -> Self {
let root = Rc::new(RefCell::new(ACNode::default())); let root = Rc::new(RefCell::new(ACNode::default()));
for word in words { for word in words.iter().map(|s| s.as_ref()) {
let mut cur = Rc::clone(&root); let mut cur = Rc::clone(&root);
for c in word.chars() { for c in word.chars() {
cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default()); cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default());