Nodejs爬虫实战(三)

127 阅读1分钟

1. 抽取函数处理

  1. 引入模块

  2. URL有http和https两种协议,协议不同,需要引入的模块(http模块或https模块)也不同

  3. url对象的parse方法可以把URL拆解成各个组成部分,其中包含协议信息。以http://example.com:8080/one?a=index&t=article&m=default为例,打印url.parse的返回结果:

     {
         protocol : 'http:' ,
         auth : null ,
         host : 'example.com:8080' ,
         port : '8080' ,
         hostname : 'example.com' ,
         hash : null ,
         search : '?a=index&t=article&m=default',
         query : 'a=index&t=article&m=default',
         pathname : '/one',
         path : '/one?a=index&t=article&m=default',
         href : 'http://example.com:8080/one?a=index&t=article&m=default'
     }
    
  4. protocol属性保存了协议名(注意末尾带冒号,例如'http:'),据此选择要引入的模块

     // url.parse() keeps the trailing colon in `protocol`, so compare against 'http:'.
     // Any other protocol falls through to the https module.
     if(urlObj.protocol === 'http:'){
         http = require('http');
     }
     else{
         http = require('https');
     }
    
  5. 处理请求错误(监听请求对象的error事件)

     // 'error' fires for transport failures (DNS lookup, refused connection, reset socket),
     // not for HTTP 404 responses, so report the real cause instead of a fixed '404'.
     req.on('error',err=>{
         console.error('request failed:',err.message);
     })
    

    #### 完整代码

     const fs = require('fs');
     const url = require('url')
     // Download the product page and save the raw response body to iponex.html.
     GetUrl('https://detail.tmall.com/item.htm?spm=a230r.1.14.6.68624507tWuF7E&id=560257961625&cm_id=140105335569ed55e27b&abbucket=18&sku_properties=10004:709990523',data=>{
         // fs.writeFile requires a completion callback: current Node.js throws a
         // TypeError without one, and a failed write would otherwise go unnoticed.
         fs.writeFile('iponex.html',data,err=>{
             if(err){
                 console.error('failed to write iponex.html:',err.message);
             }
         });
     })
     /**
      * Request a URL over HTTP or HTTPS and pass the complete response body to a callback.
      *
      * The body is delivered regardless of the HTTP status code; only transport-level
      * failures are routed to the error path.
      *
      * @param {string} sUrl - Absolute URL to request. `http:` URLs use the http module,
      *     every other protocol is sent through the https module.
      * @param {function(Buffer): void} [success] - Called once with the concatenated
      *     response body after the response has ended.
      * @param {function(Error): void} [failure] - Optional. Called once if the request or
      *     the response stream fails. When omitted, the error is logged to stderr.
      * @returns {void}
      */
     function GetUrl(sUrl,success,failure){
         const urlObj = url.parse(sUrl);
         // url.parse() keeps the trailing colon in `protocol`.
         const client = urlObj.protocol === 'http:' ? require('http') : require('https');

         // The request and the response can both emit 'error' for the same broken
         // connection; report only the first one.
         let failed = false;
         const onError = err=>{
             if(failed){
                 return;
             }
             failed = true;
             if(typeof failure === 'function'){
                 failure(err);
             }
             else{
                 console.error(`request to ${sUrl} failed: ${err.message}`);
             }
         };

         const req = client.request({
             hostname:urlObj.hostname,
             // Forward an explicit port; without it a URL such as
             // http://example.com:8080/ would be sent to the protocol's default port.
             port:urlObj.port,
             path:urlObj.path
         },res=>{
             const chunks = [];
             res.on('data',chunk=>{
                 chunks.push(chunk);
             });
             res.on('end',()=>{
                 if(success){
                     success(Buffer.concat(chunks));
                 }
             });
             res.on('error',onError);
         });

         // Attach the listener before sending so no early failure is missed.
         req.on('error',onError);
         req.end();
     }