Implementing a Novel Ranking-List Scraper in Go

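The program below crawls the monthly-ticket ranking pages on zongheng.com: it fetches each page over HTTP, extracts every novel's rank, title, author, monthly ticket count, and detail-page link with regular expressions, and appends the results as tab-separated lines to a local text file. The complete listing follows.
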
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
)

// formatStr pads a string with tab characters based on its byte length so that
// columns roughly line up in the output file. Chinese characters take 3 bytes
// each in UTF-8, which is why most case values are multiples of 3.
func formatStr(str *string) {
	switch len(*str) {
	case 3, 6, 7, 9:
		*str = *str + "\t\t\t\t"
	case 8, 10, 11, 12, 15:
		*str = *str + "\t\t\t"
	case 18, 21:
		*str = *str + "\t\t"
	case 24, 27:
		*str = *str + "\t"
	}
}
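
// Padding by byte length only handles the title lengths listed above; titles
// of other lengths get no padding at all. A more robust alternative, sketched
// here as an untested idea rather than part of the original program, is the
// text/tabwriter package:
//
//	w := tabwriter.NewWriter(fd, 0, 8, 2, ' ', 0)
//	fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", rank, name, author, ticket, link)
//	w.Flush()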

// Novel describes one entry in the ranking list. It is declared here but not
// yet used by the code below, which works with raw strings instead.
type Novel struct {
	Rank   int
	Name   string
	Author string
	Ticket string
	Link   string
}

// makeFile appends one line per novel (rank, title, author, monthly tickets,
// detail-page link) to the file at path, extracting each field from the raw
// regexp matches.
func makeFile(path string, novelRank, novelName, author, ticket [][]string) {
	// Open the file for appending, creating it if it does not exist yet.
	fd, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0664)
	if err != nil {
		fmt.Println("os.OpenFile err:", err)
		return
	}
	defer fd.Close()

	novelMap := make(map[string]string)

	line := len(novelName)
	for i := 0; i < line; i++ {
		// Extract the novel's rank.
		slicesNovelRank := strings.Split(novelRank[i][0], ">")
		slicesNovelRank = strings.Split(slicesNovelRank[1], "<")
		strNovelRank := slicesNovelRank[0]

		// Extract the novel's title.
		regName := regexp.MustCompile(`bookName = ".*?"`)
		novelname := regName.FindAllStringSubmatch(novelName[i][0], -1)
		strName := strings.Split(novelname[0][0], "\"")

		// Extract the novel's ID.
		regID := regexp.MustCompile(`bookId=".*?"`)
		novelID := regID.FindAllStringSubmatch(novelName[i][0], -1)
		strID := strings.Split(novelID[0][0], "\"")

		// Map the title to its detail-page URL.
		novelMap[strName[1]] = "http://book.zongheng.com/book/" + strID[1] + ".html"
		name := strName[1]
		formatStr(&name)

		// Extract the author.
		slicesAuthor := strings.Split(author[i][0], "\"")
		strAuthor := slicesAuthor[3]
		formatStr(&strAuthor)

		// Extract the monthly ticket count.
		slicesTicket := strings.Split(ticket[i][0], ">")
		slicesTicket = strings.Split(slicesTicket[1], "<")
		strTicket := slicesTicket[0]

		fd.WriteString(strNovelRank + "\t\t\t\t" + name + "\t\t\t\t" + strAuthor + "\t\t" + strTicket + "\t\t\t\t" + novelMap[strName[1]] + "\n")
	}
}
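
// Note: regName and regID above are recompiled on every loop iteration;
// hoisting the two regexp.MustCompile calls out of the for loop (or to package
// level) would avoid the repeated compilation work.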

// httpGetDB fetches the page at url and returns its body as a string.
func httpGetDB(url string) (result string, err error) {
	respond, err := http.Get(url)
	if err != nil {
		fmt.Println("http.Get:", err)
		return
	}
	defer respond.Body.Close()
	fmt.Printf("Fetching page: %s\n", url)

	// Read the response body in 4 KB chunks until EOF.
	for {
		buf := make([]byte, 4096)
		n, err2 := respond.Body.Read(buf)
		if n == 0 {
			break
		}
		if err2 != nil && err2 != io.EOF {
			fmt.Println("respond.Body.Read:", err2)
			err = err2
			return
		}
		// Append the chunk to result.
		result += string(buf[:n])
	}
	return
}
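
// Note: on Go 1.16 and later the read loop above can be replaced by a single
// call such as data, err := io.ReadAll(respond.Body), which reads the whole
// body into one byte slice.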

// SpiderPages crawls one ranking page, extracts the fields with regular
// expressions, and appends them to the output file.
func SpiderPages(index int, ch chan<- int) {
	// Signal main that this page has been handled, even if an error occurs.
	defer func() { ch <- index }()

	// Build the URL of the ranking page.
	url := "http://www.zongheng.com/rank/details.html?rt=1&d=1&p=" + strconv.Itoa(index)

	// Fetch the page.
	result, err := httpGetDB(url)
	if err != nil {
		fmt.Println("httpGetDB err:", err)
		return
	}

	// Regular expression for the novel's rank.
	reg := regexp.MustCompile(`<div class="rank_d_icon rank_d_b_num rank_d_b_num.*</div>`)
	novelRank := reg.FindAllStringSubmatch(result, -1)

	// Regular expression for the novel's title and ID.
	reg1 := regexp.MustCompile(`div class="rank_d_list borderB_c_dsh clearfix".*>`)
	novelName := reg1.FindAllStringSubmatch(result, -1)

	// Regular expression for the author.
	reg2 := regexp.MustCompile(`"rank_d_b_cate" title=".*"`)
	novelAuthor := reg2.FindAllStringSubmatch(result, -1)

	// Regular expression for the monthly ticket count.
	reg3 := regexp.MustCompile(`<div class="rank_d_b_ticket">[0-9]+<span>月票</span></div>`)
	novelTicket := reg3.FindAllStringSubmatch(result, -1)

	// Write the extracted data to the output file.
	path := "C:/Users/yy/Desktop/小说排行榜爬虫.txt"
	makeFile(path, novelRank, novelName, novelAuthor, novelTicket)
}

func main() {
	var start, end int
	fmt.Print("Enter the first page to crawl: ")
	fmt.Scan(&start)
	fmt.Print("Enter the last page to crawl: ")
	fmt.Scan(&end)
	fmt.Printf("Crawling pages %d to %d\n", start, end)

	// Channel used to keep the main goroutine alive until every page is done.
	block := make(chan int)
	for i := start; i <= end; i++ {
		go SpiderPages(i, block)
	}
	// Wait for one completion signal per page.
	for i := start; i <= end; i++ {
		fmt.Printf("Finished page %d\n", <-block)
	}
}
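
Running the program prompts for a start page and an end page, fetches the ranking pages concurrently, and appends one tab-separated line per novel (rank, title, author, monthly tickets, detail-page link) to 小说排行榜爬虫.txt on the desktop. An output line looks roughly like the following (placeholder values, not real data):

	1				ExampleTitle				ExampleAuthor		12345				http://book.zongheng.com/book/100000.html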