持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第2天。
gocolly是go语言编写的比较出名的爬虫库,初次尝试,个人觉得colly的api简洁明了,配合goquery功能强大。
安装
go-colly官网,官网目前我打开速度比较慢(正常上网情况下),有时候打不开。使用
go mod的话直接输入下面的命令。目前我使用的go版本是1.19。
$ mkdir colly && cd colly
$ go mod init colly
$ go get -u github.com/gocolly/colly/...
$ go mod tidy
colly/main.go
主要尝试一下
colly,所以全部的代码就都写在main文件里了,爬虫库使用了colly;另外还使用了一个觉得不错的excel操作库excelize,excelize官方文档,文档上面写的很详细,需要使用excel可以看看🙃。直接开始上代码抓取豆瓣电影top25排行榜,你也可以爬top250😂 开始之前,先打开网页看一下豆瓣的html结构
从网页源代码上我们可以看出需要抓取的信息在
.item这个类选择器下,我们主要提取电影标题、豆瓣地址、豆瓣评分和是否可播放这几个信息,后续想要提取更多的可以自己扩展。 先定义一个结构体存储我们需要的信息
// Hot holds the fields scraped for one movie entry of the ranking list.
//
// The struct tags are consumed by colly's HTMLElement.Unmarshal: `selector`
// chooses the element relative to the matched node and, when an `attr` tag is
// present, the value of that attribute is stored instead of the element text.
type Hot struct {
	// Movie_name is taken from the first child <span> of the title link.
	Movie_name string `selector:"div.item>div.info>div.hd>a>span:nth-child(1)"`
	// Href is the link target of the title anchor. Without the attr tag the
	// anchor's text would be stored here instead of its URL.
	Href string `selector:"div.item>div.info>div.hd>a[href]" attr:"href"`
	// Rating is taken from the span carrying the rating_num class.
	Rating string `selector:"div.item>div.info>div.bd>div.star>span.rating_num"`
	// Playable is taken from the span carrying the playable class; it stays
	// empty when the entry has no such span.
	Playable string `selector:"div.item>div.info>div.hd>span.playable"`
}
创建
colly collector去请求网页,等待响应后处理返回的响应结果
c := colly.NewCollector(
colly.AllowedDomains("movie.douban.com"),
)
收到响应后处理,因为返回的是
html格式,所以调用HTML回调处理
// Print the HTTP status code of every response the collector receives.
c.OnResponse(func(resp *colly.Response) {
	fmt.Println(resp.StatusCode)
})

// Every <li> of the ranking list is handled on its own; the details are read
// from the div.info node nested inside it.
c.OnHTML("ol.grid_view>li", func(el *colly.HTMLElement) {
	el.DOM.Find("div.item>div.info").Each(func(_ int, info *goquery.Selection) {
		link := info.Find("div.hd>a")
		// The "exists" flag of Attr is ignored on purpose here.
		url, _ := link.Attr("href")
		name := link.Find("span.title").Text()
		canPlay := info.Find("div.hd>span.playable").Text()
		score := info.Find("div.bd>div.star>span.rating_num").Text()
		fmt.Println("电影名:", name, "评分:", score, "地址:", url, canPlay)
	})
})
请求开始
c.Visit("https://movie.douban.com/top250")
fmt.Println("host length:", len(hots))
把结果存储到
excel
// WriteFile creates a new workbook, writes the header row into "Sheet1",
// applies the header layout and styling, saves the workbook as films.xlsx and
// returns the still-open file.
//
// A failure to save terminates the program through log.Fatalln. Failures of
// the header/layout calls are only logged, because the workbook remains
// usable without them.
func WriteFile() *excelize.File {
	f := excelize.NewFile()
	index := f.NewSheet("Sheet1")

	if err := f.SetSheetRow("Sheet1", "A1", &[]any{"电影名", "评分", "地址", "是否可观看"}); err != nil {
		log.Println("set header row:", err)
	}
	// Row height of the header row.
	if err := f.SetRowHeight("Sheet1", 1, 30); err != nil {
		log.Println("set row height:", err)
	}
	// Column widths; column B keeps its default width.
	widths := []struct {
		col   string
		width float64
	}{
		{"A", 40},
		{"C", 40},
		{"D", 12},
	}
	for _, w := range widths {
		if err := f.SetColWidth("Sheet1", w.col, w.col, w.width); err != nil {
			log.Println("set column width:", err)
		}
	}

	setTitleStyle(f)
	f.SetActiveSheet(index)

	// log.Fatalln exits the process, so nothing after it would ever run.
	if err := f.SaveAs("films.xlsx"); err != nil {
		log.Fatalln("err", err)
	}
	return f
}
// setTitleStyle styles the header row of "Sheet1" and centers columns B and D.
//
// Styling is cosmetic, so every failure is printed and the function carries
// on rather than aborting.
func setTitleStyle(f *excelize.File) {
	// report prints a styling failure; a nil error is ignored.
	report := func(err error) {
		if err != nil {
			fmt.Println("set style error:", err)
		}
	}

	// Header cells that are only vertically centered (A1, C1).
	leftHeader, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Header cells centered both horizontally and vertically (B1, D1).
	centeredHeader, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Whole-column style: centered both ways, default font.
	centeredCol, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
	})
	report(err)

	report(f.SetCellStyle("Sheet1", "A1", "A1", leftHeader))
	report(f.SetCellStyle("Sheet1", "C1", "C1", leftHeader))
	// Order matters: a later style call overwrites an earlier one, so the
	// column style must be set before the header cell of that column.
	report(f.SetColStyle("Sheet1", "B", centeredCol))
	report(f.SetCellStyle("Sheet1", "B1", "B1", centeredHeader))
	report(f.SetColStyle("Sheet1", "D", centeredCol))
	report(f.SetCellStyle("Sheet1", "D1", "D1", centeredHeader))
}
完整代码如下,
github代码地址
package main
import (
"fmt"
"log"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
"github.com/xuri/excelize/v2"
)
// Hot holds the fields scraped for one movie entry of the ranking list.
//
// The struct tags are consumed by colly's HTMLElement.Unmarshal: `selector`
// chooses the element relative to the matched node and, when an `attr` tag is
// present, the value of that attribute is stored instead of the element text.
type Hot struct {
	// Movie_name is taken from the first child <span> of the title link.
	Movie_name string `selector:"div.item>div.info>div.hd>a>span:nth-child(1)"`
	// Href is the link target of the title anchor. Without the attr tag the
	// anchor's text would be stored here instead of its URL.
	Href string `selector:"div.item>div.info>div.hd>a[href]" attr:"href"`
	// Rating is taken from the span carrying the rating_num class.
	Rating string `selector:"div.item>div.info>div.bd>div.star>span.rating_num"`
	// Playable is taken from the span carrying the playable class; it stays
	// empty when the entry has no such span.
	Playable string `selector:"div.item>div.info>div.hd>span.playable"`
}
// main scrapes the first page of the Douban Top 250 list and writes one row
// per movie into films.xlsx, creating the workbook first when it is missing.
func main() {
	f, err := excelize.OpenFile("./films.xlsx")
	switch {
	case err == nil:
		// The workbook already exists; rows are written into it.
	case strings.Contains(err.Error(), "no such file"):
		f = WriteFile()
	default:
		// Any other open failure leaves f nil; stop here instead of
		// dereferencing it in the deferred Close below.
		log.Fatalln("open films.xlsx:", err)
	}
	defer func() {
		if err := f.Close(); err != nil {
			log.Println("close films.xlsx:", err)
		}
	}()

	var hots []*Hot
	c := colly.NewCollector(
		colly.AllowedDomains("movie.douban.com"),
	)

	// Print the HTTP status code of every response.
	c.OnResponse(func(r *colly.Response) {
		fmt.Println(r.StatusCode)
	})

	// Each <li> of the ranking list is one movie.
	c.OnHTML("ol.grid_view>li", func(h *colly.HTMLElement) {
		hot := &Hot{}
		if err := h.Unmarshal(hot); err != nil {
			log.Println("unmarshal movie entry:", err)
			return
		}
		// Read the link target straight from the DOM so Href holds the URL
		// regardless of what Unmarshal stored in it.
		h.DOM.Find("div.item>div.info").Each(func(_ int, s *goquery.Selection) {
			href, _ := s.Find("div.hd>a").Attr("href")
			hot.Href = href
		})
		hots = append(hots, hot)
	})

	if err := c.Visit("https://movie.douban.com/top250"); err != nil {
		log.Println("visit top250:", err)
		return
	}
	fmt.Println("host length:", len(hots))

	// Row 1 is the header, so data starts at row 2.
	for index, val := range hots {
		cell := "A" + strconv.Itoa(index+2)
		row := []any{val.Movie_name, val.Rating, val.Href, val.Playable}
		if err := f.SetSheetRow("Sheet1", cell, &row); err != nil {
			log.Println("write row", cell+":", err)
		}
	}
	if err := f.Save(); err != nil {
		log.Println("save films.xlsx:", err)
	}
}
// WriteFile creates a new workbook, writes the header row into "Sheet1",
// applies the header layout and styling, saves the workbook as films.xlsx and
// returns the still-open file.
//
// A failure to save terminates the program through log.Fatalln. Failures of
// the header/layout calls are only logged, because the workbook remains
// usable without them.
func WriteFile() *excelize.File {
	f := excelize.NewFile()
	index := f.NewSheet("Sheet1")

	if err := f.SetSheetRow("Sheet1", "A1", &[]any{"电影名", "评分", "地址", "是否可观看"}); err != nil {
		log.Println("set header row:", err)
	}
	// Row height of the header row.
	if err := f.SetRowHeight("Sheet1", 1, 30); err != nil {
		log.Println("set row height:", err)
	}
	// Column widths; column B keeps its default width.
	widths := []struct {
		col   string
		width float64
	}{
		{"A", 40},
		{"C", 40},
		{"D", 12},
	}
	for _, w := range widths {
		if err := f.SetColWidth("Sheet1", w.col, w.col, w.width); err != nil {
			log.Println("set column width:", err)
		}
	}

	setTitleStyle(f)
	f.SetActiveSheet(index)

	// log.Fatalln exits the process, so nothing after it would ever run.
	if err := f.SaveAs("films.xlsx"); err != nil {
		log.Fatalln("err", err)
	}
	return f
}
// setTitleStyle styles the header row of "Sheet1" and centers columns B and D.
//
// Styling is cosmetic, so every failure is printed and the function carries
// on rather than aborting.
func setTitleStyle(f *excelize.File) {
	// report prints a styling failure; a nil error is ignored.
	report := func(err error) {
		if err != nil {
			fmt.Println("set style error:", err)
		}
	}

	// Header cells that are only vertically centered (A1, C1).
	leftHeader, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Header cells centered both horizontally and vertically (B1, D1).
	centeredHeader, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
		Font:      &excelize.Font{Family: "黑体", Size: 14},
	})
	report(err)

	// Whole-column style: centered both ways, default font.
	centeredCol, err := f.NewStyle(&excelize.Style{
		Alignment: &excelize.Alignment{Vertical: "center", Horizontal: "center"},
	})
	report(err)

	report(f.SetCellStyle("Sheet1", "A1", "A1", leftHeader))
	report(f.SetCellStyle("Sheet1", "C1", "C1", leftHeader))
	// Order matters: a later style call overwrites an earlier one, so the
	// column style must be set before the header cell of that column.
	report(f.SetColStyle("Sheet1", "B", centeredCol))
	report(f.SetCellStyle("Sheet1", "B1", "B1", centeredHeader))
	report(f.SetColStyle("Sheet1", "D", centeredCol))
	report(f.SetCellStyle("Sheet1", "D1", "D1", centeredHeader))
}