Rust爬取第三方必应网站图片地址、名称、日期数据 (2)

155 阅读2分钟

截屏2023-11-24 20.51.54.png

继续爬取相关数据：这次我们爬取必应壁纸的图片地址、名称、日期等信息。

use chrono::NaiveDate;
use lazy_static::lazy_static;
use reqwest::{Client, StatusCode};
use htmler::Selector;

/// Program entry point.
///
/// Intentionally empty: the scraping functions in this file are driven by the
/// `tests` module rather than by `main`.
#[tokio::main]
async fn main() {}

lazy_static! {
    /// Base URL of the third-party Bing wallpaper site.
    static ref BING_URL: String = String::from("https://peapix.com/bing/cn");

    /// Selects each wallpaper card column on a listing page.
    static ref BING_IMG_LIST_SELECTOR: Selector =
        Selector::parse(r#"div[class="col-md-6 col-lg-4"]"#).unwrap();
    /// Selects the container inside a card that holds picture, link and date.
    static ref BING_IMG_ROW_SELECTOR: Selector =
        Selector::parse(r#"div[class="image-list__container"]"#).unwrap();
    /// Selects the picture element whose `data-bgset` attribute is read as the image URL.
    static ref BING_PIC_SELECTOR: Selector =
        Selector::parse(r#"div[class="image-list__picture lazyload"]"#).unwrap();
    /// Selects the link whose `title` attribute is read as the wallpaper name.
    static ref BING_DESC_SELECTOR: Selector =
        Selector::parse(r#"a[class="image-list__link"]"#).unwrap();
    /// Selects the element whose inner HTML is parsed as the wallpaper date.
    static ref BING_DATE_SELECTOR: Selector =
        Selector::parse(r#"span[class="text-gray"]"#).unwrap();
    /// Selects the pagination links used to determine the total page count.
    static ref BING_PAGE_NUMBERS_SELECTOR: Selector =
        Selector::parse(r#"a[class="page-link"]"#).unwrap();
}


/// `get_bing_total_page` 获取bing网页中总页数
///
/// # Examples
///
/// ```
/// let total = get_bing_dom(1);
///
/// assert_eq!(6, total);
/// ```
async fn get_bing_total_page() -> Option<i32>{
    let client = Client::new();
    if let Ok(res) = client.get(BING_URL.clone()).send().await{
        let html_dom = res.text().await.unwrap();
        let html = htmler::Html::parse_fragment(&html_dom);
        if let Some(node) = html.select(&BING_PAGE_NUMBERS_SELECTOR).last(){
            return Some(node.inner_html().parse::<i32>().unwrap());
        }
    }
    None
}

/// Model of one Bing daily wallpaper entry.
#[derive(Debug, Clone)]
struct BingWallpaperModal {
    /// Wallpaper name.
    name: String,
    /// Date the wallpaper was added as the Bing daily image.
    add_date: NaiveDate,
    /// Standard-definition image URL.
    img_url: String,
    /// 2K image URL.
    uhd_img_url: String,
}

/// `get_bing_page` fetches one page of the Bing wallpaper listing and extracts its entries.
///
/// `current_page` is sent as the `page` query parameter of `BING_URL`.
///
/// Returns `None` when the request fails, the response status is not
/// `200 OK`, or the body cannot be read. Otherwise returns the wallpapers
/// found on the page; a card that lacks the picture, link or date element, or
/// whose date does not match `"%B %d, %Y"`, is skipped instead of panicking.
///
/// # Examples
///
/// ```ignore
/// let bing_vec = get_bing_page(1).await;
///
/// assert!(!bing_vec.unwrap().is_empty());
/// ```
async fn get_bing_page(current_page: i32) -> Option<Vec<BingWallpaperModal>> {
    let client = Client::new();
    let url = format!("{}?page={}", BING_URL.as_str(), current_page);

    let res = client.get(url).send().await.ok()?;
    if res.status() != StatusCode::OK {
        return None;
    }

    let data = res.text().await.ok()?;
    let html = htmler::Html::parse_fragment(&data);

    let bing_wallpaper_vec: Vec<BingWallpaperModal> = html
        .select(&BING_IMG_LIST_SELECTOR)
        .filter_map(|img_list| {
            let node = img_list.select(&BING_IMG_ROW_SELECTOR).next()?;

            // The page references the 480px rendition; the stored standard
            // image uses the 240px file name instead.
            let img_url = node
                .select(&BING_PIC_SELECTOR)
                .next()?
                .get_attribute("data-bgset")
                .replace("480.jpg", "240.jpg");
            let desc = node
                .select(&BING_DESC_SELECTOR)
                .next()?
                .get_attribute("title");
            let date = node.select(&BING_DATE_SELECTOR).next()?.inner_html();

            // Surrounding whitespace in the markup would make the parse fail.
            let add_date = NaiveDate::parse_from_str(date.trim(), "%B %d, %Y").ok()?;

            Some(BingWallpaperModal {
                name: desc.to_string(),
                // Derived before `img_url` is moved into the struct below.
                uhd_img_url: img_url.replace("240.jpg", "2560.jpg"),
                img_url,
                add_date,
            })
        })
        .collect();

    Some(bing_wallpaper_vec)
}

/// Integration-style tests; both hit the live site and therefore need network access.
#[cfg(test)]
mod tests {
    use super::{get_bing_page, get_bing_total_page};

    /// The listing must report a positive page count.
    ///
    /// The exact number is not asserted because it changes as the site
    /// publishes new wallpapers.
    #[actix_rt::test]
    async fn get_bing_total_page_test() {
        let total = get_bing_total_page().await;
        assert!(
            matches!(total, Some(pages) if pages > 0),
            "expected a positive page count, got {:?}",
            total
        );
    }

    /// The first page must be fetched and contain at least one wallpaper.
    #[actix_rt::test]
    async fn get_bing_page_test() {
        let bing_wallpaper_vec = get_bing_page(1)
            .await
            .expect("page 1 should be fetched and parsed");
        println!("{:#?}", bing_wallpaper_vec);
        assert!(!bing_wallpaper_vec.is_empty());
    }
}

截屏2023-11-24 20.59.47.png

截屏2023-11-24 20.54.46.png