Initial commit
commit 050138a9fb

.gitignore (vendored, Normal file, +2)
@@ -0,0 +1,2 @@
.vscode/
target/

Cargo.lock (generated, Normal file, +1415)
File diff suppressed because it is too large

Cargo.toml (Normal file, +12)
@@ -0,0 +1,12 @@
[package]
name = "quote-scrape"
version = "0.1.0"
authors = ["hr567 <hr567@hr567.me>"]
edition = "2018"

[dependencies]
lazy_static = "1.4.0"
reqwest = "0.11.0"
scraper = "0.12.0"
tokio = { version = "1.0.2", features = [ "rt-multi-thread", "sync" ] }
url = "2.2.0"

src/main.rs (Normal file, +80)
@@ -0,0 +1,80 @@
use std::sync::Arc;

use lazy_static::lazy_static;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio::{
    runtime::Runtime,
    sync::{mpsc, Semaphore},
};
use url::Url;

const MAX_TASK: usize = 16;

lazy_static! {
    static ref URL: Url = Url::parse("https://quotes.toscrape.com/").unwrap();
    static ref CLIENT: Client = {
        use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
        let mut headers = HeaderMap::new();
        let user_agent = HeaderValue::from_static(
            r"Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0",
        );
        headers.insert(USER_AGENT, user_agent);
        Client::builder().default_headers(headers).build().unwrap()
    };
}

#[derive(Debug)]
struct Quote {
    text: String,
    author: String,
    tags: Vec<String>,
}

async fn download_quote_html(idx: usize) -> reqwest::Result<String> {
    let page_url = URL.join(&format!("page/{}/", idx)).unwrap();
    let res = CLIENT.get(page_url).send().await?;
    let html = res.text().await?;
    Ok(html)
}

fn parse_quote_html(page: Html) -> Vec<Quote> {
    lazy_static! {
        static ref QUOTE: Selector = Selector::parse(r#".quote"#).unwrap();
        static ref TEXT: Selector = Selector::parse(r#".text"#).unwrap();
        static ref AUTHOR: Selector = Selector::parse(r#".author"#).unwrap();
        static ref TAG: Selector = Selector::parse(r#".tag"#).unwrap();
    }
    page.select(&QUOTE)
        .map(|quote| Quote {
            text: quote.select(&TEXT).next().unwrap().inner_html(),
            author: quote.select(&AUTHOR).next().unwrap().inner_html(),
            tags: quote.select(&TAG).map(|e| e.inner_html()).collect(),
        })
        .collect()
}

fn main() {
    let rt = Runtime::new().unwrap();
    let pool = Arc::new(Semaphore::new(MAX_TASK));
    let (tx, mut rx) = mpsc::unbounded_channel::<Quote>();

    for page in 1..20 {
        let pool = Arc::clone(&pool);
        let tx = tx.clone();
        rt.spawn(async move {
            let _permit = pool.acquire().await.unwrap();
            let text = download_quote_html(page).await.unwrap();
            let html = Html::parse_document(&text);
            let quotes = parse_quote_html(html);
            for quote in quotes.into_iter() {
                tx.send(quote).unwrap();
            }
        });
    }
    drop(tx);

    while let Some(quote) = rx.blocking_recv() {
        println!("{:?}", quote);
    }
}