// chromedp 爬取王者荣耀所有壁纸 — scrape all Honor of Kings wallpapers with chromedp.
// (Article metadata from the original post: 216 views, 1 min read.)

package main

import (
   "bytes"
   "context"
   "errors"
   "fmt"
   "io"
   "log"
   "net/http"
   "os"
   "sync"
   "time"
   "unsafe"

   "github.com/antchfx/htmlquery"
   "github.com/chromedp/chromedp"
)

var wg sync.WaitGroup

func main() {
	// Buffered channel: the scraper goroutine pushes one HTML snapshot
	// per wallpaper list page (33 pages total).
	htmls = make(chan string, 33)
	url := "https://pvp.qq.com/web201605/wallpaper.shtml"
	sel := `#Work_List_Container_267733`
	three := `document.querySelector("body")`

	// Make sure the download target directory exists before any worker
	// tries to write into it (httpDownLoad saves to ./images/<name>.jpg).
	if err := os.MkdirAll("./images", 0o755); err != nil {
		log.Fatalf("create images dir: %v", err)
	}

	wg.Add(1)
	go GetHttpHtmlContent(url, sel, three)
	for i := 0; i < 33; i++ {
		data, ok := <-htmls
		if !ok {
			// Producer closed the channel early (error or fewer pages);
			// don't spawn workers on zero-value reads.
			break
		}
		fmt.Println(i, ok)
		wg.Add(1)
		go GetSpecialImages(data)
	}
	wg.Wait()
}

// toBytes returns the bytes of s without copying. The result aliases
// the string's storage and therefore must never be mutated.
//
// The previous *(*[]byte)(unsafe.Pointer(&s)) trick reinterpreted a
// 2-word string header as a 3-word slice header, leaving cap as
// undefined garbage. unsafe.Slice + unsafe.StringData (Go 1.20+)
// construct a well-formed slice with len == cap == len(s).
func toBytes(s string) []byte {
	if len(s) == 0 {
		return nil
	}
	return unsafe.Slice(unsafe.StringData(s), len(s))
}

var htmls chan string
// xapth 获取url
func GetSpecialImages(htmlContent string) error {
   body := toBytes(htmlContent)
   doc, err := htmlquery.Parse(bytes.NewReader(body))
   if err != nil {
      fmt.Println(err)
   }

   urls := []string{}
   nodes := htmlquery.Find(doc, `//*[@id="Work_List_Container_267733"]/div/ul/li[7]/a/@href`)
   for _, node := range nodes {
      fmt.Println("fetch ", node.FirstChild.Data)
      urls = append(urls, node.FirstChild.Data)
   }

   names := []string{}
   nodes1 := htmlquery.Find(doc, `//*[@id="Work_List_Container_267733"]/div/h4/a`)
   for _, node := range nodes1 {
      fmt.Println("fetch ", node.FirstChild.Data)
      names = append(names, node.FirstChild.Data)
   }

   for i, v := range urls {
      name := names[i]
      fmt.Println(v, "->: ", name)
      httpDownLoad(v, name)
   }
   defer wg.Done()
   return nil
}

// 获取html
func GetHttpHtmlContent(url string, selector string, sel interface{}) {
   options := []chromedp.ExecAllocatorOption{
      chromedp.Flag("headless", false),
      chromedp.Flag("blink-settings", "imagesEnabled=true"),
      chromedp.UserAgent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36`),
   }
   options = append(chromedp.DefaultExecAllocatorOptions[:], options...)

   c, _ := chromedp.NewExecAllocator(context.Background(), options...)

   // create context
   chromeCtx, cancel := chromedp.NewContext(c, chromedp.WithLogf(log.Printf))

   chromedp.Run(chromeCtx, make([]chromedp.Action, 0, 1)...)

   timeoutCtx, cancel := context.WithTimeout(chromeCtx, 120*time.Second)
   defer cancel()

   var htmlContent string
   err := chromedp.Run(timeoutCtx,
      chromedp.Navigate(url),
      chromedp.WaitVisible(selector),
      chromedp.OuterHTML(sel, &htmlContent, chromedp.ByJSPath),
   )
   if err != nil {
      fmt.Println(err)
   }
   htmls <- htmlContent
   for i := 0; i < 32; i++ {
      htmlContent = ""
      err := chromedp.Run(timeoutCtx,
         chromedp.Click(`#Page_Container_267733 > a.downpage`),
         chromedp.OuterHTML(sel, &htmlContent, chromedp.ByJSPath),
      )
      if err != nil {
         panic(errors.New("get html failed"))
      }
      htmls <- htmlContent
      fmt.Println(len(htmls), i)
      time.Sleep(time.Second)
   }
   close(htmls)
   defer wg.Done()
}

// httpDownLoad fetches url and saves the response body as
// ./images/<name>.jpg, creating the directory if needed. Returns a
// non-nil error when the request, a non-2xx status, reading the body,
// or writing the file fails.
func httpDownLoad(url string, name string) error {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Printf("Http get [%v] failed! %v\n", url, err)
		return err
	}
	defer resp.Body.Close()

	// The original ignored the status code and happily saved 404 pages
	// as .jpg files.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		err := fmt.Errorf("http get [%v]: unexpected status %s", url, resp.Status)
		fmt.Println(err)
		return err
	}

	content, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("Read http response failed! %v\n", err)
		return err
	}

	// Create the target directory on demand; the original write failed
	// outright when ./images did not exist.
	if err := os.MkdirAll("./images", 0o755); err != nil {
		fmt.Printf("Create images dir failed! %v\n", err)
		return err
	}
	filename := "./images/" + name + ".jpg"
	if err := os.WriteFile(filename, content, 0666); err != nil {
		fmt.Printf("Save to file failed! %v\n", err)
		return err
	}
	return nil
}