Compare commits

..

2 Commits

Author SHA1 Message Date
hr567
4d8990d37f
Use DFA for better performance
Signed-off-by: hr567 <hr567@linux.alibaba.com>
2024-03-08 16:36:41 +08:00
4e1206a1bb
Use aho-corasick crate
Signed-off-by: hr567 <hr567@hr567.me>
2024-03-08 11:47:00 +08:00
6 changed files with 288 additions and 584957 deletions

552
Cargo.lock generated
View File

@ -4,9 +4,9 @@ version = 3
[[package]]
name = "addr2line"
version = "0.22.0"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
dependencies = [
"gimli",
]
@ -19,73 +19,72 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.3"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.14"
version = "0.6.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.7"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc"
[[package]]
name = "anstyle-parse"
version = "0.2.4"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.0"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
dependencies = [
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.3"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
dependencies = [
"anstyle",
"windows-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "autocfg"
version = "1.3.0"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "backtrace"
version = "0.3.72"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11"
checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
dependencies = [
"addr2line",
"cc",
@ -98,36 +97,21 @@ dependencies = [
[[package]]
name = "bitflags"
version = "2.5.0"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bytes"
version = "1.6.0"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9"
checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
[[package]]
name = "cc"
version = "1.0.99"
version = "1.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695"
[[package]]
name = "cedarwood"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
dependencies = [
"smallvec",
]
checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5"
[[package]]
name = "cfg-if"
@ -137,9 +121,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.5.7"
version = "4.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651"
dependencies = [
"clap_builder",
"clap_derive",
@ -147,9 +131,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.7"
version = "4.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f"
checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4"
dependencies = [
"anstream",
"anstyle",
@ -159,9 +143,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.5.5"
version = "4.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47"
dependencies = [
"heck",
"proc-macro2",
@ -171,15 +155,15 @@ dependencies = [
[[package]]
name = "clap_lex"
version = "0.7.1"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce"
[[package]]
name = "colorchoice"
version = "1.0.1"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
[[package]]
name = "console"
@ -191,42 +175,7 @@ dependencies = [
"lazy_static",
"libc",
"unicode-width",
"windows-sys",
]
[[package]]
name = "darling"
version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.20.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178"
dependencies = [
"darling_core",
"quote",
"syn",
"windows-sys 0.52.0",
]
[[package]]
@ -243,49 +192,12 @@ dependencies = [
"serde",
]
[[package]]
name = "derive_builder"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
dependencies = [
"derive_builder_macro",
]
[[package]]
name = "derive_builder_core"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "derive_builder_macro"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
dependencies = [
"derive_builder_core",
"syn",
]
[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures"
version = "0.3.30"
@ -375,32 +287,23 @@ dependencies = [
"slab",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "gimli"
version = "0.29.0"
version = "0.28.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
[[package]]
name = "hashbrown"
version = "0.14.5"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
[[package]]
name = "heck"
version = "0.5.0"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
@ -408,12 +311,6 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "indicatif"
version = "0.17.8"
@ -429,39 +326,18 @@ dependencies = [
[[package]]
name = "instant"
version = "0.1.13"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
[[package]]
name = "itoa"
version = "1.0.11"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jieba-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1e2b0210dc78b49337af9e49d7ae41a39dceac6e5985613f1cf7763e2f76a25"
dependencies = [
"cedarwood",
"derive_builder",
"fxhash",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c"
[[package]]
name = "lazy_static"
@ -471,15 +347,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.155"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "lock_api"
version = "0.4.12"
version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45"
dependencies = [
"autocfg",
"scopeguard",
@ -493,19 +369,30 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
[[package]]
name = "memchr"
version = "2.7.2"
version = "2.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149"
[[package]]
name = "miniz_oxide"
version = "0.7.3"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
dependencies = [
"adler",
]
[[package]]
name = "mio"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [
"libc",
"wasi",
"windows-sys 0.48.0",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
@ -534,9 +421,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "object"
version = "0.35.0"
version = "0.32.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e"
checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
dependencies = [
"memchr",
]
@ -555,9 +442,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking_lot"
version = "0.12.3"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core",
@ -565,60 +452,22 @@ dependencies = [
[[package]]
name = "parking_lot_core"
version = "0.9.10"
version = "0.9.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-targets",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
"windows-targets 0.48.5",
]
[[package]]
name = "pin-project-lite"
version = "0.2.14"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
[[package]]
name = "pin-utils"
@ -634,86 +483,42 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
[[package]]
name = "proc-macro2"
version = "1.0.85"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.36"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "redox_syscall"
version = "0.5.1"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
[[package]]
name = "rustc-demangle"
version = "0.1.24"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "ryu"
version = "1.0.18"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1"
[[package]]
name = "scopeguard"
@ -723,18 +528,18 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "serde"
version = "1.0.203"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.203"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
dependencies = [
"proc-macro2",
"quote",
@ -743,9 +548,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.117"
version = "1.0.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3"
checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0"
dependencies = [
"itoa",
"ryu",
@ -762,10 +567,13 @@ dependencies = [
]
[[package]]
name = "siphasher"
version = "0.3.11"
name = "signal-hook-registry"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1"
dependencies = [
"libc",
]
[[package]]
name = "slab"
@ -778,21 +586,31 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.13.2"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
[[package]]
name = "socket2"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "strsim"
version = "0.11.1"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01"
[[package]]
name = "syn"
version = "2.0.66"
version = "2.0.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
dependencies = [
"proc-macro2",
"quote",
@ -811,23 +629,28 @@ dependencies = [
[[package]]
name = "tokio"
version = "1.38.0"
version = "1.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
dependencies = [
"backtrace",
"bytes",
"libc",
"mio",
"num_cpus",
"parking_lot",
"pin-project-lite",
"signal-hook-registry",
"socket2",
"tokio-macros",
"windows-sys 0.48.0",
]
[[package]]
name = "tokio-macros"
version = "2.3.0"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
dependencies = [
"proc-macro2",
"quote",
@ -899,15 +722,15 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "unicode-width"
version = "0.1.13"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "utf8parse"
version = "0.2.2"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "valuable"
@ -915,6 +738,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "winapi"
version = "0.3.9"
@ -937,88 +766,147 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets",
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.5"
name = "windows_i686_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.5"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "word_freq_analyzer"
version = "0.1.0"
dependencies = [
"aho-corasick",
"clap",
"dashmap",
"futures",
"indicatif",
"jieba-rs",
"once_cell",
"serde_json",
"tokio",

View File

@ -6,23 +6,14 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.5.7", features = ["derive"] }
aho-corasick = "1.1.2"
clap = { version = "4.5.2", features = ["derive"] }
dashmap = { version = "5.5.3", features = ["serde"] }
futures = "0.3.30"
indicatif = "0.17.8"
jieba-rs = { version = "0.7.0", default-features = false }
once_cell = "1.19.0"
serde_json = "1.0.117"
tokio = { version = "1.38.0", features = [
"rt",
"rt-multi-thread",
"io-util",
"time",
"macros",
"sync",
"fs",
"parking_lot",
] }
serde_json = "1.0.114"
tokio = { version = "1.36.0", features = ["full"] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"

584429
dict.txt.big

File diff suppressed because it is too large Load Diff

View File

@ -1,27 +0,0 @@
数字经济
数字媒体
数字化
数字资产
数字化转型
数据管理
数据挖掘
数据网络
数据平台
数据中心
数据科学
数字控制
数字技术
数字通信
数字网络
数字智能
数字终端
数字营销
大数据
云计算
云IT
云生态
云服务
云平台
区块链
物联网
机器学习

View File

@ -1,21 +0,0 @@
R&D
研发
创新
研究
开发
研制
科研
预研
设计
创造
实验
试验
技术
专利
工艺
新项目
新产品
新业务
知识产权
科技成果
科技投入

View File

@ -1,68 +1,84 @@
use std::cell::RefCell;
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::env::current_dir;
use std::collections::HashMap;
use std::fmt::Write;
use std::fs::read_to_string;
use std::io::{self, Cursor};
use std::io;
use std::os::unix::prelude::*;
use std::path::PathBuf;
use std::rc::{Rc, Weak};
use std::str;
use std::sync::{
atomic::{AtomicBool, AtomicUsize, Ordering},
Arc,
};
use std::time::Duration;
use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
use clap::Parser;
use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use jieba_rs::Jieba;
use once_cell::{sync::Lazy, unsync::OnceCell};
use once_cell::sync::Lazy;
use tokio::{fs, sync::Semaphore, time::sleep};
use tracing::Instrument;
static KEYWORDS: &[&str] = &[
"数字经济",
"数字媒体",
"数字化",
"数字资产",
"数字化转型",
"数据管理",
"数据挖掘",
"数据网络",
"数据平台",
"数据中心",
"数据科学",
"数字控制",
"数字技术",
"数字通信",
"数字网络",
"数字智能",
"数字终端",
"数字营销",
"大数据",
"云计算",
"云IT",
"云生态",
"云服务",
"云平台",
"区块链",
"物联网",
"机器学习",
];
static ANALYZER: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::builder()
.kind(Some(AhoCorasickKind::DFA))
.match_kind(MatchKind::Standard)
.prefilter(true)
.build(KEYWORDS.iter())
.unwrap()
});
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
#[arg(short, long, default_value_os_t = current_dir().unwrap())]
#[arg(short, long)]
directory: PathBuf,
#[arg(short, long, default_value_t = 16)]
#[arg(short, long, default_value = "16")]
jobs: usize,
#[arg(short = 'f', long, default_value_os_t = current_dir().unwrap().join("keywords.txt"))]
keywords_file: PathBuf,
#[arg(last = true)]
keywords: Vec<String>,
}
static ARGS: Lazy<Args> = Lazy::new(|| {
let mut args = Args::parse();
if args.keywords.is_empty() && args.keywords_file.exists() {
let content = read_to_string(&args.keywords_file).expect("failed to read keywords file");
args.keywords = content.split_whitespace().map(|s| s.to_string()).collect();
}
args
});
static SEGMENTATION: Lazy<Jieba> = Lazy::new(|| {
const DICT_CONTENT: &[u8] = include_bytes!("../dict.txt.big");
Jieba::with_dict(&mut Cursor::new(DICT_CONTENT)).expect("failed to open jieba with dict")
});
thread_local! {
static ANALYZER: OnceCell<AhoCorasick> = const { OnceCell::new() };
}
#[tokio::main]
async fn main() -> io::Result<()> {
tracing_subscriber::fmt::init();
let args = Args::parse();
tracing::debug!(
"start to analyze {} with {} jobs",
ARGS.directory.display(),
ARGS.jobs
args.directory.display(),
args.jobs
);
tracing::debug!("current keywords are {:?}", &ARGS.keywords);
let all_tasks = {
let mut res = Vec::new();
let mut files = fs::read_dir(&ARGS.directory).await?;
let mut files = fs::read_dir(&args.directory).await?;
while let Some(entry) = files.next_entry().await? {
if entry.path().extension().map_or(true, |ext| ext != "txt") {
continue;
@ -74,7 +90,7 @@ async fn main() -> io::Result<()> {
let metadata = entry.metadata().await?;
let task = Task {
file: entry.path(),
size: metadata.len() as usize,
size: metadata.size() as usize,
};
res.push(task);
}
@ -82,7 +98,7 @@ async fn main() -> io::Result<()> {
};
let analysis = Arc::new(DashMap::new());
let permits = Arc::new(Semaphore::new(ARGS.jobs));
let permits = Arc::new(Semaphore::new(args.jobs));
let total_size: usize = all_tasks.iter().map(|t| t.size).sum();
let analyzed_size = Arc::new(AtomicUsize::new(0));
tracing::info!("A total of {} bytes of text to analyze", total_size);
@ -100,31 +116,28 @@ async fn main() -> io::Result<()> {
tracing::info!("Start to read file");
let buf = fs::read(&task.file).await?;
let len = buf.len();
let content = String::from_utf8_lossy(&buf);
tracing::debug!("Start to analyze");
let words_cnt = SEGMENTATION.cut(&content, true).len();
let result = ANALYZER.with(|analyzer| {
let mut res = analyzer
.get_or_init(|| AhoCorasick::new(&ARGS.keywords))
.analyze(&content);
res.insert("_total", words_cnt);
res
});
for (word, count) in result.iter() {
tracing::trace!(word = %word, count = %count, "Analyzed");
let mut result = HashMap::new();
for mat in ANALYZER.find_iter(&buf) {
let word = str::from_utf8(&buf[mat.range()]).unwrap();
tracing::trace!(word = %word, "Matched");
result
.entry(word.to_string())
.and_modify(|e| *e += 1)
.or_insert(1);
analysis
.entry(word.to_string())
.and_modify(|e| *e += count)
.or_insert(*count);
.and_modify(|e| *e += 1)
.or_insert(1);
}
tracing::debug!("Finished analysis");
tracing::info!("Write result to file");
let json_result = serde_json::to_vec(&result).unwrap();
fs::write(task.file.with_extension("json"), json_result).await?;
tracing::info!("Write result to file");
analyzed_size.fetch_add(len, Ordering::Release);
tracing::debug!("Finished analysis");
io::Result::Ok(())
}
@ -175,87 +188,3 @@ struct Task {
file: PathBuf,
size: usize,
}
#[derive(Default)]
pub struct AhoCorasick {
root: Rc<RefCell<ACNode>>,
}
impl AhoCorasick {
pub fn new<S: AsRef<str>>(words: &[S]) -> Self {
let root = Rc::new(RefCell::new(ACNode::default()));
for word in words.iter().map(|s| s.as_ref()) {
let mut cur = Rc::clone(&root);
for c in word.chars() {
cur = Rc::clone(Rc::clone(&cur).borrow_mut().trans.entry(c).or_default());
}
cur.borrow_mut().lengths.push(word.len());
}
Self::build_suffix(Rc::clone(&root));
Self { root }
}
fn build_suffix(root: Rc<RefCell<ACNode>>) {
let mut q = VecDeque::new();
q.push_back(Rc::clone(&root));
while let Some(parent) = q.pop_front() {
let parent = parent.borrow();
for (c, child) in &parent.trans {
q.push_back(Rc::clone(child));
let mut child = child.borrow_mut();
let mut suffix = parent.suffix.upgrade();
loop {
match &suffix {
None => {
child.lengths.extend(root.borrow().lengths.clone());
child.suffix = Rc::downgrade(&root);
break;
}
Some(node) => {
if node.borrow().trans.contains_key(c) {
let node = &node.borrow().trans[c];
child.lengths.extend(node.borrow().lengths.clone());
child.suffix = Rc::downgrade(node);
break;
}
suffix = suffix.unwrap().borrow().suffix.upgrade();
}
}
}
}
}
}
pub fn analyze<'a>(&self, s: &'a str) -> HashMap<&'a str, usize> {
let mut ans = HashMap::new();
let mut cur = Rc::clone(&self.root);
let mut position: usize = 0;
for c in s.chars() {
loop {
if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) {
cur = Rc::clone(child);
break;
}
let suffix = cur.borrow().suffix.clone();
match suffix.upgrade() {
Some(node) => cur = node,
None => break,
}
}
position += c.len_utf8();
for &len in &cur.borrow().lengths {
ans.entry(&s[position - len..position])
.and_modify(|e| *e += 1)
.or_insert(1);
}
}
ans
}
}
#[derive(Default)]
struct ACNode {
trans: BTreeMap<char, Rc<RefCell<ACNode>>>,
suffix: Weak<RefCell<ACNode>>,
lengths: Vec<usize>,
}