Initial commit
This commit is contained in:
commit
050138a9fb
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
.vscode/
|
||||
target/
|
1415
Cargo.lock
generated
Normal file
1415
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
12
Cargo.toml
Normal file
12
Cargo.toml
Normal file
@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "quote-scrape"
|
||||
version = "0.1.0"
|
||||
authors = ["hr567 <hr567@hr567.me>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
reqwest = "0.11.0"
|
||||
scraper = "0.12.0"
|
||||
tokio = { version = "1.0.2", features = [ "rt-multi-thread", "sync" ] }
|
||||
url = "2.2.0"
|
80
src/main.rs
Normal file
80
src/main.rs
Normal file
@ -0,0 +1,80 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use reqwest::Client;
|
||||
use scraper::{Html, Selector};
|
||||
use tokio::{
|
||||
runtime::Runtime,
|
||||
sync::{mpsc, Semaphore},
|
||||
};
|
||||
use url::Url;
|
||||
|
||||
/// Number of permits in the semaphore that every scraping task must acquire before it
/// downloads a page, i.e. the maximum number of pages being processed at the same time.
const MAX_TASK: usize = 16;
|
||||
|
||||
lazy_static! {
    /// Base address that every page URL is resolved against.
    static ref URL: Url = Url::parse("https://quotes.toscrape.com/").unwrap();

    /// HTTP client shared by all downloads; it sends a desktop Firefox `User-Agent`
    /// header with every request.
    static ref CLIENT: Client = {
        use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};

        let mut default_headers = HeaderMap::new();
        default_headers.insert(
            USER_AGENT,
            HeaderValue::from_static(
                r"Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0",
            ),
        );

        let builder = Client::builder().default_headers(default_headers);
        builder.build().unwrap()
    };
}
|
||||
|
||||
/// One quotation extracted from a listing page.
///
/// Each field holds the inner HTML of the matching element, exactly as the parser
/// serializes it.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Quote {
    /// Inner HTML of the first `.text` element inside the quote.
    text: String,
    /// Inner HTML of the first `.author` element inside the quote.
    author: String,
    /// Inner HTML of every `.tag` element inside the quote, in document order.
    tags: Vec<String>,
}
|
||||
|
||||
async fn download_quote_html(idx: usize) -> reqwest::Result<String> {
|
||||
let page_url = URL.join(&format!("page/{}/", idx)).unwrap();
|
||||
let res = CLIENT.get(page_url).send().await?;
|
||||
let html = res.text().await?;
|
||||
Ok(html)
|
||||
}
|
||||
|
||||
fn parse_quote_html(page: Html) -> Vec<Quote> {
|
||||
lazy_static! {
|
||||
static ref QUOTE: Selector = Selector::parse(r#".quote"#).unwrap();
|
||||
static ref TEXT: Selector = Selector::parse(r#".text"#).unwrap();
|
||||
static ref AUTHOR: Selector = Selector::parse(r#".author"#).unwrap();
|
||||
static ref TAG: Selector = Selector::parse(r#".tag"#).unwrap();
|
||||
}
|
||||
page.select("E)
|
||||
.map(|quote| Quote {
|
||||
text: quote.select(&TEXT).next().unwrap().inner_html(),
|
||||
author: quote.select(&AUTHOR).next().unwrap().inner_html(),
|
||||
tags: quote.select(&TAG).map(|e| e.inner_html()).collect(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let rt = Runtime::new().unwrap();
|
||||
let pool = Arc::new(Semaphore::new(MAX_TASK));
|
||||
let (tx, mut rx) = mpsc::unbounded_channel::<Quote>();
|
||||
|
||||
for page in 1..20 {
|
||||
let pool = Arc::clone(&pool);
|
||||
let tx = tx.clone();
|
||||
rt.spawn(async move {
|
||||
let _permit = pool.acquire().await.unwrap();
|
||||
let text = download_quote_html(page).await.unwrap();
|
||||
let html = Html::parse_document(&text);
|
||||
let quotes = parse_quote_html(html);
|
||||
for quote in quotes.into_iter() {
|
||||
tx.send(quote).unwrap();
|
||||
}
|
||||
});
|
||||
}
|
||||
drop(tx);
|
||||
|
||||
while let Some(quote) = rx.blocking_recv() {
|
||||
println!("{:?}", quote);
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user