简易版本
所有代码都写在 main.go 中
初始化项目
go mod init crawler
抓取网页内容
package main
import (
"fmt"
"io/ioutil"
"net/http"
)
// main downloads the zhenai.com city-list page and dumps the raw response
// body to stdout. A transport or read failure panics; a non-200 status is
// printed and the program exits normally.
func main() {
	const seedURL = "http://www.zhenai.com/zhenghun"

	resp, err := http.Get(seedURL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		fmt.Println("Error", code)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", body)
}
根据页面开头的内容(例如 meta 标签)检测网页编码,如果是 GBK 等非 UTF-8 编码,就转换为 UTF-8
package main
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"net/http"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
"golang.org/x/net/html/charset"
)
// main downloads the zhenai.com city-list page, converts it to UTF-8 based
// on the detected charset, and prints the result to stdout. A transport or
// read failure panics; a non-200 status is printed and the program exits
// normally.
func main() {
	resp, err := http.Get(
		"http://www.zhenai.com/zhenghun",
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error", resp.StatusCode)
		return
	}

	// Sniff the charset and decode through the SAME buffered reader.
	// determineEncoding peeks via bufio; handing it the raw body would pull
	// the first chunk of the page into a throw-away buffer and the decoder
	// below would start reading after it, silently dropping the page head.
	// bufio.NewReader returns an existing *bufio.Reader of sufficient size
	// unchanged, so the peeked bytes stay available in bodyReader.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())

	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", all)
}
// determineEncoding guesses the character encoding of the stream r by
// inspecting up to its first 1024 bytes with charset.DetermineEncoding.
//
// The bytes are obtained with bufio's Peek. If r is already a *bufio.Reader
// with a large enough buffer, bufio.NewReader returns r itself and nothing is
// consumed; otherwise the peeked data is buffered in a temporary reader and
// is no longer available from r.
//
// It panics on a read error other than io.EOF.
func determineEncoding(r io.Reader) encoding.Encoding {
	head, err := bufio.NewReader(r).Peek(1024)
	// A body shorter than 1024 bytes makes Peek return io.EOF together with
	// the bytes it did read; that is a short page, not a failure.
	if err != nil && err != io.EOF {
		panic(err)
	}
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}
正则获取城市列表
新增方法
// cityListRe matches one city link on the zhenai.com city-list page.
// Capture group 1 is the city URL, group 2 is the link text (city name).
// The pattern expects double-quoted href attributes (an earlier
// single-quoted variant was marked as incorrect), and the dots in the host
// name are escaped so they match only a literal '.'.
// It is compiled once at package initialisation instead of on every call.
var cityListRe = regexp.MustCompile(
	`<a href="(http://www\.zhenai\.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)

// ParserCityList scans content for city links and prints one
// "City: <name>, URL: <url>" line per match to stdout.
func ParserCityList(content []byte) {
	for _, m := range cityListRe.FindAllSubmatch(content, -1) {
		fmt.Printf("City: %s, URL: %s\n", m[2], m[1])
	}
}
完整代码
package main
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"net/http"
"regexp"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
"golang.org/x/net/html/charset"
)
// main downloads the zhenai.com city-list page, converts it to UTF-8 based
// on the detected charset, and hands the decoded page to ParserCityList.
// A transport or read failure panics; a non-200 status is printed and the
// program exits normally.
func main() {
	resp, err := http.Get(
		"http://www.zhenai.com/zhenghun",
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error", resp.StatusCode)
		return
	}

	// Sniff the charset and decode through the SAME buffered reader.
	// determineEncoding peeks via bufio; handing it the raw body would pull
	// the first chunk of the page into a throw-away buffer and the decoder
	// below would start reading after it, silently dropping the page head.
	// bufio.NewReader returns an existing *bufio.Reader of sufficient size
	// unchanged, so the peeked bytes stay available in bodyReader.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())

	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	ParserCityList(all)
}
// determineEncoding guesses the character encoding of the stream r by
// inspecting up to its first 1024 bytes with charset.DetermineEncoding.
//
// The bytes are obtained with bufio's Peek. If r is already a *bufio.Reader
// with a large enough buffer, bufio.NewReader returns r itself and nothing is
// consumed; otherwise the peeked data is buffered in a temporary reader and
// is no longer available from r.
//
// It panics on a read error other than io.EOF.
func determineEncoding(r io.Reader) encoding.Encoding {
	head, err := bufio.NewReader(r).Peek(1024)
	// A body shorter than 1024 bytes makes Peek return io.EOF together with
	// the bytes it did read; that is a short page, not a failure.
	if err != nil && err != io.EOF {
		panic(err)
	}
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}
// cityListRe matches one city link on the zhenai.com city-list page.
// Capture group 1 is the city URL, group 2 is the link text (city name).
// The pattern expects double-quoted href attributes (an earlier
// single-quoted variant was marked as incorrect), and the dots in the host
// name are escaped so they match only a literal '.'.
// It is compiled once at package initialisation instead of on every call.
var cityListRe = regexp.MustCompile(
	`<a href="(http://www\.zhenai\.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)

// ParserCityList scans content for city links and prints one
// "City: <name>, URL: <url>" line per match to stdout.
func ParserCityList(content []byte) {
	for _, m := range cityListRe.FindAllSubmatch(content, -1) {
		fmt.Printf("City: %s, URL: %s\n", m[2], m[1])
	}
}
优化项目结构
爬城市列表
项目总览
engine
type.go
package engine
// Request is one unit of crawl work: a page address plus the function that
// knows how to interpret that page.
type Request struct {
// Url is the address of the page to download.
Url string
// ParserFunc receives the page body as bytes and returns the follow-up
// requests and items found in it.
ParserFunc func([]byte) ParserResult
}
// ParserResult is what a parser function returns for a single page.
type ParserResult struct {
// Requests are further pages to crawl.
Requests []Request
// Items are the values extracted from the page; their concrete type is
// chosen by the parser.
Items []interface{}
}
// NilParser is a placeholder parser: it ignores the page body and returns
// an empty ParserResult (no follow-up requests, no items).
func NilParser(_ []byte) ParserResult {
	var empty ParserResult
	return empty
}
engine.go
package engine
import (
"crawler/fetcher"
"log"
)
// Run crawls starting from seeds, processing requests one at a time in FIFO
// order until the queue is empty.
//
// For each request it fetches the URL, passes the body to the request's
// ParserFunc, appends the returned requests to the queue and logs every
// returned item. A fetch error is logged and the request is skipped. A
// request without a ParserFunc is logged and skipped before fetching, since
// calling a nil function would panic.
func Run(seeds ...Request) {
	// Copy the seeds so the work queue never aliases the caller's slice.
	requests := append([]Request(nil), seeds...)

	for len(requests) > 0 {
		r := requests[0]
		requests = requests[1:]

		if r.ParserFunc == nil {
			log.Printf("Engine: no parser for url %s, skipping", r.Url)
			continue
		}

		log.Printf("Fetching %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			// The original message concatenated "error"+"fetching" without
			// a space, printing "errorfetching".
			log.Printf("Fetcher: error fetching url %s: %v", r.Url, err)
			continue
		}

		parserResult := r.ParserFunc(body)
		requests = append(requests, parserResult.Requests...)
		for _, item := range parserResult.Items {
			log.Printf("Got item %v", item)
		}
	}
}
fetcher
fetcher.go
package fetcher
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"golang.org/x/net/html/charset"
)
// Fetch downloads url and returns its body converted to UTF-8 according to
// the charset detected from the beginning of the page.
//
// It returns an error if the request fails, if the response status is not
// 200 OK, or if reading/decoding the body fails.
func Fetch(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil,
			fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}

	// Sniff the charset and decode through the SAME buffered reader.
	// determineEncoding peeks via bufio; handing it the raw body would pull
	// the first chunk of the page into a throw-away buffer and the decoder
	// below would start reading after it, silently dropping the page head.
	// bufio.NewReader returns an existing *bufio.Reader of sufficient size
	// unchanged, so the peeked bytes stay available in bodyReader.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}
// determineEncoding guesses the character encoding of the stream r by
// inspecting up to its first 1024 bytes with charset.DetermineEncoding.
//
// The bytes are obtained with bufio's Peek. If r is already a *bufio.Reader
// with a large enough buffer, bufio.NewReader returns r itself and nothing is
// consumed; otherwise the peeked data is buffered in a temporary reader and
// is no longer available from r.
//
// If nothing usable could be read, the error is logged and unicode.UTF8 is
// returned as a fallback.
func determineEncoding(r io.Reader) encoding.Encoding {
	head, err := bufio.NewReader(r).Peek(1024)
	// A body shorter than 1024 bytes makes Peek return io.EOF together with
	// the bytes it did read. That is a short page, not a failure, so the
	// charset is still detected from those bytes instead of assuming UTF-8.
	if err != nil && (err != io.EOF || len(head) == 0) {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}
parser
zhenai/parser/citylist.go
package parser
import (
"crawler/engine"
"regexp"
)
// cityListRe is the pattern for one city link on the zhenai.com city-list
// page. Capture group 1 is the city URL, group 2 is the link text (city
// name). The dots in the host name are escaped so they match only a
// literal '.'.
const cityListRe = `<a href="(http://www\.zhenai\.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`

// cityListMatcher is cityListRe compiled once at package initialisation,
// so ParseCityList does not recompile it on every call.
var cityListMatcher = regexp.MustCompile(cityListRe)

// ParseCityList extracts every city link from contents.
//
// For each match the city name is added to the result's Items and a Request
// for the city URL is added to its Requests. Those requests use
// engine.NilParser, so the city pages themselves yield nothing further.
func ParseCityList(contents []byte) engine.ParserResult {
	result := engine.ParserResult{}
	for _, m := range cityListMatcher.FindAllSubmatch(contents, -1) {
		result.Items = append(result.Items, string(m[2]))
		result.Requests = append(
			result.Requests, engine.Request{
				Url:        string(m[1]),
				ParserFunc: engine.NilParser,
			})
	}
	return result
}
main.go
package main
import (
"crawler/engine"
"crawler/zhenai/parser"
)
// main starts the crawl with a single seed: the zhenai.com city-list page,
// parsed by parser.ParseCityList.
func main() {
	seed := engine.Request{
		Url:        "http://www.zhenai.com/zhenghun",
		ParserFunc: parser.ParseCityList,
	}
	engine.Run(seed)
}