A minimal crawler implementation


Principle

The request is made from the server side, so there is no cross-origin (CORS) restriction to work around.
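For contrast, a minimal sketch (the origins here are only illustrative): the same request issued from a browser page on another origin is subject to the same-origin policy and cannot read the response unless news.baidu.com sends matching CORS headers, whereas a Node.js process making the request server-side is not checked at all.

// In browser code served from some other origin (illustrative only):
// the browser refuses to expose the response unless the target site
// replies with a matching Access-Control-Allow-Origin header.
fetch('http://news.baidu.com')
  .then(function (res) { return res.text(); })
  .then(function (html) { console.log(html.length); })
  .catch(function (err) { console.error('blocked by CORS in the browser:', err); });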

Implementation

let http = require('http');
let fs = require('fs');

// Target of the crawl: the front page of news.baidu.com over plain HTTP.
let opts = {
  host: 'news.baidu.com',
};

http.createServer(function (req, res) {
  // Each incoming request triggers one crawl of the target page.
  let client = http.request(opts, function (r) {
    let arr = [];
    // Collect the raw chunks as Buffers; concatenating before toString()
    // avoids cutting multi-byte characters in half at chunk boundaries.
    r.on('data', function (data) {
      arr.push(data);
    });
    r.on('end', function () {
      let result = Buffer.concat(arr).toString();
      console.log(result);
      // Pull out every <li class="bold-item">...</li> block from the page.
      let lis = result.match(/<li class="bold-item"(?:[\s\S]*?)<\/li>/img);
      res.setHeader('Content-Type', 'text/html;charset=utf-8');
      // match() returns null when nothing is found, so guard before writing.
      if (lis) {
        fs.appendFileSync('./crawl.txt', lis.join('\n'));
      }
      res.end('结束'); // '结束' means "done"
    });
  });
  client.end();
}).listen(3000);
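If the target page is only reachable over HTTPS (an assumption; the example above fetches it over plain HTTP), the built-in https module can be dropped in with the same request/response handling:

// HTTPS variant, a sketch under the assumption above; host and path are unchanged.
let https = require('https');
https.get({ host: 'news.baidu.com', path: '/' }, function (r) {
  let arr = [];
  r.on('data', function (chunk) { arr.push(chunk); });
  r.on('end', function () {
    console.log(Buffer.concat(arr).toString().length);
  });
});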

Trigger it with curl: the crawl starts, the matched content is appended to crawl.txt, and the server responds to finish the request.

➜  July curl -v localhost:3000
* Rebuilt URL to: localhost:3000/
*   Trying 127.0.0.1...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 3000 (#0)
> GET / HTTP/1.1
> Host: localhost:3000
> User-Agent: curl/7.54.0
> Accept: */*
> 
< HTTP/1.1 200 OK
< Content-Type: text/html;charset=utf8
< Date: Wed, 22 Aug 2018 08:46:08 GMT
< Connection: keep-alive
< Content-Length: 9
< 
* Connection #0 to host localhost left intact
结束%  

A crawl.txt file containing the extracted content is generated locally.
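A quick way to check what was written, sketched here by reusing the filename from the example above:

// Print whatever the crawler appended to crawl.txt.
let fs = require('fs');
console.log(fs.readFileSync('./crawl.txt', 'utf-8'));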