go 初次尝试 colly

360 阅读3分钟

持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第2天。

wallerphoto.jpg gocolly是go语言编写的比较出名的爬虫库。初次尝试,个人觉得colly的api简洁明了,配合goquery功能强大。

安装

go-colly官网,官网目前我打开速度比较慢(正常上网情况下),有时候打不开。使用go mod的话直接输入下面的命令。目前我使用的go版本是1.19。

# Create a directory for the project and enter it.
$ mkdir colly && cd colly
# Initialise a Go module named "colly" in that directory.
$ go mod init colly
# Download colly and its sub-packages, upgrading to the latest versions.
$ go get -u github.com/gocolly/colly/...
# Reconcile go.mod/go.sum with the imports used by the source files.
$ go mod tidy

colly/main.go

主要尝试一下colly,所以全部的代码就都写在main文件里了。爬虫库使用了colly;另外还使用了一个觉得不错的excel操作库excelize(excelize官方文档),文档上面写得很详细,需要使用excel可以看看🙃。直接开始上代码抓取豆瓣电影 top25排行榜,你也可以爬top250😂 开始之前,先打开网页看一下豆瓣的html结构。

截屏2022-10-13 21.47.31.png

从网页源代码上我们可以看出需要抓取的信息在 .item这个类选择器下,我们主要提取电影标题、豆瓣地址、豆瓣评分和是否可播放这几个信息,后续想要提取更多的可以自己扩展。 先定义一个结构体存储我们需要的信息

// Hot holds the fields scraped for one movie entry of the Douban list.
//
// The struct tags drive colly's HTMLElement.Unmarshal: `selector` is a CSS
// selector evaluated inside the matched element, and `attr`, when present,
// makes Unmarshal read that attribute instead of the element's text.
//
//   - Movie_name: text of the first span inside the entry's title link.
//   - Href: value of the title link's href attribute (the movie's page URL).
//   - Rating: text of the span.rating_num element.
//   - Playable: text of the span.playable element; empty when it is absent.
type Hot struct {
	Movie_name string `selector:"div.item>div.info>div.hd>a>span:nth-child(1)"`
	Href       string `selector:"div.item>div.info>div.hd>a[href]" attr:"href"`
	Rating     string `selector:"div.item>div.info>div.bd>div.star>span.rating_num"`
	Playable   string `selector:"div.item>div.info>div.hd>span.playable"`
}

创建colly collector去请求网页,等待响应后处理返回的响应结果

c := colly.NewCollector(
    colly.AllowedDomains("movie.douban.com"),
)

收到响应后进行处理:因为返回的是html格式,所以注册OnHTML回调来处理。

// Print the HTTP status code of every response the collector receives.
c.OnResponse(func(resp *colly.Response) {
	fmt.Println(resp.StatusCode)
})

// printMovie prints the title, rating, link and "playable" marker found in
// one div.info element.
printMovie := func(_ int, info *goquery.Selection) {
	link := info.Find("div.hd>a")
	url, _ := link.Attr("href")

	fmt.Println(
		"电影名:", link.Find("span.title").Text(),
		"评分:", info.Find("div.bd>div.star>span.rating_num").Text(),
		"地址:", url,
		info.Find("div.hd>span.playable").Text(),
	)
}

// Each <li> of the ranking list is one movie entry.
c.OnHTML("ol.grid_view>li", func(item *colly.HTMLElement) {
	item.DOM.Find("div.item>div.info").Each(printMovie)
})

请求开始

c.Visit("https://movie.douban.com/top250")
fmt.Println("host length:", len(hots))

把结果存储到excel

截屏2022-10-13 22.02.43.png

// WriteFile creates a new workbook with a styled header row on "Sheet1",
// saves it as films.xlsx in the working directory and returns the open file.
//
// Layout and header problems are logged and skipped because they do not stop
// the workbook from being usable. A failure to save terminates the program
// via log.Fatalln.
func WriteFile() *excelize.File {
	f := excelize.NewFile()
	index := f.NewSheet("Sheet1")

	if err := f.SetSheetRow("Sheet1", "A1", &[]any{"电影名", "评分", "地址", "是否可观看"}); err != nil {
		log.Println("err", err)
	}
	// Header row height.
	if err := f.SetRowHeight("Sheet1", 1, 30); err != nil {
		log.Println("err", err)
	}
	// Widths of the name, link and "playable" columns.
	widths := []struct {
		col   string
		width float64
	}{
		{"A", 40},
		{"C", 40},
		{"D", 12},
	}
	for _, w := range widths {
		if err := f.SetColWidth("Sheet1", w.col, w.col, w.width); err != nil {
			log.Println("err", err)
		}
	}

	setTitleStyle(f)
	f.SetActiveSheet(index)

	// log.Fatalln exits the process, so nothing after it can run.
	if err := f.SaveAs("films.xlsx"); err != nil {
		log.Fatalln("err", err)
	}
	return f
}
// setTitleStyle styles the header row and the centred columns of "Sheet1".
//
// Header cells A1 and C1 get the 黑体 font at size 14, vertically centred;
// B1 and D1 get the same font centred in both directions; columns B and D are
// centred in both directions. Styling is best-effort: every failure is printed
// and the remaining steps still run.
func setTitleStyle(f *excelize.File) {
	// report prints a styling error, if any, without aborting.
	report := func(err error) {
		if err != nil {
			fmt.Println("set style error:", err)
		}
	}

	// Header style: vertically centred.
	leftTitle, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Header style: centred horizontally and vertically.
	centerTitle, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Column style: centred horizontally and vertically, default font.
	centerCol, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
	})
	report(err)

	report(f.SetCellStyle("Sheet1", "A1", "A1", leftTitle))
	report(f.SetCellStyle("Sheet1", "C1", "C1", leftTitle))
	// Order matters: a later style call overwrites an earlier one, so each
	// column is styled before its header cell.
	report(f.SetColStyle("Sheet1", "B", centerCol))
	report(f.SetCellStyle("Sheet1", "B1", "B1", centerTitle))
	report(f.SetColStyle("Sheet1", "D", centerCol))
	report(f.SetCellStyle("Sheet1", "D1", "D1", centerTitle))
}

完整代码如下,github代码地址

package main

import (
	"fmt"
	"log"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
	"github.com/xuri/excelize/v2"
)

// Hot holds the fields scraped for one movie entry of the Douban list.
//
// The struct tags drive colly's HTMLElement.Unmarshal: `selector` is a CSS
// selector evaluated inside the matched element, and `attr`, when present,
// makes Unmarshal read that attribute instead of the element's text.
//
//   - Movie_name: text of the first span inside the entry's title link.
//   - Href: value of the title link's href attribute (the movie's page URL).
//   - Rating: text of the span.rating_num element.
//   - Playable: text of the span.playable element; empty when it is absent.
type Hot struct {
	Movie_name string `selector:"div.item>div.info>div.hd>a>span:nth-child(1)"`
	Href       string `selector:"div.item>div.info>div.hd>a[href]" attr:"href"`
	Rating     string `selector:"div.item>div.info>div.bd>div.star>span.rating_num"`
	Playable   string `selector:"div.item>div.info>div.hd>span.playable"`
}

// main scrapes https://movie.douban.com/top250 and writes one row per movie
// (name, rating, link, playable marker) into films.xlsx, starting at row 2.
// The workbook is created through WriteFile when it does not exist yet.
func main() {
	f, err := excelize.OpenFile("./films.xlsx")
	if err != nil {
		// Only a missing workbook can be recovered from. After any other
		// failure f cannot be used, so stop here instead of crashing later
		// in the deferred Close.
		if !strings.Contains(err.Error(), "no such file") {
			log.Fatalln("open films.xlsx:", err)
		}
		f = WriteFile()
	}
	defer func() {
		if err := f.Close(); err != nil {
			log.Println("close films.xlsx:", err)
		}
	}()

	var hots []*Hot
	c := colly.NewCollector(
		colly.AllowedDomains("movie.douban.com"),
	)
	c.OnResponse(func(r *colly.Response) {
		fmt.Println(r.StatusCode)
	})
	// Each <li> of the ranking list is one movie entry.
	c.OnHTML("ol.grid_view>li", func(h *colly.HTMLElement) {
		hot := &Hot{}
		if err := h.Unmarshal(hot); err != nil {
			log.Println("unmarshal movie:", err)
			return
		}

		// Take the link from the anchor's href attribute so that Href holds
		// the URL regardless of what Unmarshal extracted for that field.
		h.DOM.Find("div.item>div.info").Each(func(_ int, s *goquery.Selection) {
			href, _ := s.Find("div.hd>a").Attr("href")
			hot.Href = href
		})

		hots = append(hots, hot)
	})

	// A failed request yields no data, so there is nothing to save.
	if err := c.Visit("https://movie.douban.com/top250"); err != nil {
		log.Println("visit error:", err)
		return
	}
	fmt.Println("host length:", len(hots))

	// Row 1 is the header, so the first movie goes to row 2.
	for i, val := range hots {
		cell := "A" + strconv.Itoa(i+2)
		row := []any{val.Movie_name, val.Rating, val.Href, val.Playable}
		if err := f.SetSheetRow("Sheet1", cell, &row); err != nil {
			log.Println("write row", cell, err)
		}
	}
	if err := f.Save(); err != nil {
		log.Println("save films.xlsx:", err)
	}
}

// WriteFile creates a new workbook with a styled header row on "Sheet1",
// saves it as films.xlsx in the working directory and returns the open file.
//
// Layout and header problems are logged and skipped because they do not stop
// the workbook from being usable. A failure to save terminates the program
// via log.Fatalln.
func WriteFile() *excelize.File {
	f := excelize.NewFile()
	index := f.NewSheet("Sheet1")

	if err := f.SetSheetRow("Sheet1", "A1", &[]any{"电影名", "评分", "地址", "是否可观看"}); err != nil {
		log.Println("err", err)
	}
	// Header row height.
	if err := f.SetRowHeight("Sheet1", 1, 30); err != nil {
		log.Println("err", err)
	}
	// Widths of the name, link and "playable" columns.
	widths := []struct {
		col   string
		width float64
	}{
		{"A", 40},
		{"C", 40},
		{"D", 12},
	}
	for _, w := range widths {
		if err := f.SetColWidth("Sheet1", w.col, w.col, w.width); err != nil {
			log.Println("err", err)
		}
	}

	setTitleStyle(f)
	f.SetActiveSheet(index)

	// log.Fatalln exits the process, so nothing after it can run.
	if err := f.SaveAs("films.xlsx"); err != nil {
		log.Fatalln("err", err)
	}
	return f
}

// setTitleStyle styles the header row and the centred columns of "Sheet1".
//
// Header cells A1 and C1 get the 黑体 font at size 14, vertically centred;
// B1 and D1 get the same font centred in both directions; columns B and D are
// centred in both directions. Styling is best-effort: every failure is printed
// and the remaining steps still run.
func setTitleStyle(f *excelize.File) {
	// report prints a styling error, if any, without aborting.
	report := func(err error) {
		if err != nil {
			fmt.Println("set style error:", err)
		}
	}

	// Header style: vertically centred.
	leftTitle, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Header style: centred horizontally and vertically.
	centerTitle, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Column style: centred horizontally and vertically, default font.
	centerCol, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
	})
	report(err)

	report(f.SetCellStyle("Sheet1", "A1", "A1", leftTitle))
	report(f.SetCellStyle("Sheet1", "C1", "C1", leftTitle))
	// Order matters: a later style call overwrites an earlier one, so each
	// column is styled before its header cell.
	report(f.SetColStyle("Sheet1", "B", centerCol))
	report(f.SetCellStyle("Sheet1", "B1", "B1", centerTitle))
	report(f.SetColStyle("Sheet1", "D", centerCol))
	report(f.SetCellStyle("Sheet1", "D1", "D1", centerTitle))
}