Initial commit

This commit is contained in:
hr567 2021-01-21 17:25:38 +08:00
commit 050138a9fb
No known key found for this signature in database
GPG Key ID: B93B0188202CB302
4 changed files with 1509 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.vscode/
target/

1415
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

12
Cargo.toml Normal file
View File

@ -0,0 +1,12 @@
[package]
name = "quote-scrape"
version = "0.1.0"
authors = ["hr567 <hr567@hr567.me>"]
edition = "2018"
[dependencies]
lazy_static = "1.4.0"
reqwest = "0.11.0"
scraper = "0.12.0"
tokio = { version = "1.0.2", features = [ "rt-multi-thread", "sync" ] }
url = "2.2.0"

80
src/main.rs Normal file
View File

@ -0,0 +1,80 @@
use std::sync::Arc;
use lazy_static::lazy_static;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio::{
runtime::Runtime,
sync::{mpsc, Semaphore},
};
use url::Url;
const MAX_TASK: usize = 16;
lazy_static! {
static ref URL: Url = Url::parse("https://quotes.toscrape.com/").unwrap();
static ref CLIENT: Client = {
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
let mut headers = HeaderMap::new();
let user_agent = HeaderValue::from_static(
r"Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0",
);
headers.insert(USER_AGENT, user_agent);
Client::builder().default_headers(headers).build().unwrap()
};
}
#[derive(Debug)]
struct Quote {
text: String,
author: String,
tags: Vec<String>,
}
async fn download_quote_html(idx: usize) -> reqwest::Result<String> {
let page_url = URL.join(&format!("page/{}/", idx)).unwrap();
let res = CLIENT.get(page_url).send().await?;
let html = res.text().await?;
Ok(html)
}
fn parse_quote_html(page: Html) -> Vec<Quote> {
lazy_static! {
static ref QUOTE: Selector = Selector::parse(r#".quote"#).unwrap();
static ref TEXT: Selector = Selector::parse(r#".text"#).unwrap();
static ref AUTHOR: Selector = Selector::parse(r#".author"#).unwrap();
static ref TAG: Selector = Selector::parse(r#".tag"#).unwrap();
}
page.select(&QUOTE)
.map(|quote| Quote {
text: quote.select(&TEXT).next().unwrap().inner_html(),
author: quote.select(&AUTHOR).next().unwrap().inner_html(),
tags: quote.select(&TAG).map(|e| e.inner_html()).collect(),
})
.collect()
}
fn main() {
let rt = Runtime::new().unwrap();
let pool = Arc::new(Semaphore::new(MAX_TASK));
let (tx, mut rx) = mpsc::unbounded_channel::<Quote>();
for page in 1..20 {
let pool = Arc::clone(&pool);
let tx = tx.clone();
rt.spawn(async move {
let _permit = pool.acquire().await.unwrap();
let text = download_quote_html(page).await.unwrap();
let html = Html::parse_document(&text);
let quotes = parse_quote_html(html);
for quote in quotes.into_iter() {
tx.send(quote).unwrap();
}
});
}
drop(tx);
while let Some(quote) = rx.blocking_recv() {
println!("{:?}", quote);
}
}