# 爬虫 (web crawler) — scrapes food and exercise data from m.hiyd.com.
# (Blog-post metadata removed so the file is valid Python.)
import requests
from lxml import etree
from db import saveFoodData 

#  m.hiyd.com 

def foodSpider(index, page):
    """Scrape one listing page of a food group from m.food.hiyd.com.

    :param index: food group id, passed as the ``group_id`` query parameter.
    :param page:  1-based page number of the listing.
    :return: a ``zip`` of ``(name, image_url, (heat, prot, carbon, zhi, info))``
             per food item, or ``None`` when the page yields no item links.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Referer' : 'http://m.food.hiyd.com/',
    }
    url = 'http://m.food.hiyd.com/food/list/?group_id=%s&kind=1&page=%s&_loader=1&_from=ajax' % (index, page)
    print(url)

    req = requests.get(url, headers=headers, timeout=10)
    # Parse the raw bytes; the HTMLParser decodes them as UTF-8.
    selector = etree.HTML(req.content, parser=etree.HTMLParser(encoding='utf8'))

    list_name = selector.xpath('//ul[@id="foodList"]/li/a/div[@class="cont"]/h3/text()')
    # The <h3> text looks like "name,extra"; keep only the part before the comma.
    list_name = [i.split(',')[0] for i in list_name]
    list_img = selector.xpath('//ul[@id="foodList"]/li/a/div[@class="img-wrap"]/img/@src')
    list_link = selector.xpath('//ul[@id="foodList"]/li/a/@href')

    def first_or(html, xp, default='未知'):
        # Evaluate *xp* once and return its stripped first match, or *default*.
        # (The original ran every XPath twice: once for the test, once for the value.)
        found = html.xpath(xp)
        return found[0].strip() if found else default

    def getdetail(link):
        # Detail hrefs are protocol-relative ("//..."), so prefix the scheme.
        print(link)
        res = requests.get('http:' + link, headers=headers, timeout=10)
        html = etree.HTML(res.content, parser=etree.HTMLParser(encoding='utf8'))

        heat = first_or(html, '//div[@class="box-row2"]/div[1]/p/em/text()')
        prot = first_or(html, '//div[@class="box-row2"]/div[2]/p/em/text()')
        carbon = first_or(html, '//div[@class="box-row2"]/div[3]/p/em/text()')
        zhi = first_or(html, '//div[@class="box-row2"]/div[4]/p/em/text()')

        # The description lives in the third "box-bd" div when present,
        # otherwise the second; query once instead of up to three times.
        boxes = html.xpath('//div[@class="box-bd"]')
        info_node = boxes[2] if len(boxes) > 2 else boxes[1]
        info = etree.tostring(info_node, encoding='utf8').decode('utf-8')
        return (heat, prot, carbon, zhi, info)

    details = [getdetail(link) for link in list_link] if list_link else []
    return zip(list_name, list_img, details) if details else None
    
if __name__ == '__main__':
    # Food group ids to crawl. NOTE(review): 910 and 132 look like typos for
    # separate ids (e.g. "9, 10" and "13, ...") — confirm against the site's
    # group list before relying on them.
    page = [1, 2, 4, 5, 6, 7, 8, 910, 132]
    # Category labels for the groups; currently unused and longer than `page`.
    name = ['主食', '肉蛋类', '奶类及制品', '蔬果', '坚果豆制品', '饮料', '食用油', '调味品', '零食', '其它', '菜肴']
    # enumerate() replaces page.index(p)+1 — that was an O(n) re-scan per
    # iteration and would mis-number duplicate ids.
    for category_id, group in enumerate(page, start=1):
        # NOTE(review): the original comment said "5 pages" but range(1, 5)
        # only visits pages 1-4 — confirm the intended page count.
        for page_no in range(1, 5):
            try:
                res = foodSpider(group, page_no)
                print(res)
                saveFoodData(category_id, res)
            except Exception as e:
                # Best-effort crawl: log and keep going on any page failure.
                print(e)
import requests
from lxml import etree
from  urllib import parse
from db import saveSportsData 

#  m.hiyd.com 

def sportSpider(page):
    """Scrape one listing page of exercises from m.hiyd.com.

    :param page: 1-based page number of the exercise listing.
    :return: a ``zip`` of ``(name, image_url, (video, skill, info))`` per
             exercise, or ``None`` when the page yields no item links.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Referer' : 'http://m.hiyd.com/',
    }
    url = 'http://m.hiyd.com/dongzuo/?gender=1&page=%s&_loader=1&_from=ajax' % page

    req = requests.get(url, headers=headers, timeout=10)
    # Parse the raw bytes; the HTMLParser decodes them as UTF-8.
    selector = etree.HTML(req.content, parser=etree.HTMLParser(encoding='utf8'))

    list_name = selector.xpath('//ul[@class="main-list "]/li/a/div[@class="item-cont"]/h3/text()')
    list_img = selector.xpath('//ul[@class="main-list "]/li/a/div[@class="item-pic"]/img/@src')
    list_link = selector.xpath('//ul[@class="main-list "]/li/a/@href')

    # Hrefs are relative; resolve them against the listing URL.
    list_link = [parse.urljoin(url, i) for i in list_link]

    def getdetail(link):
        print(link)
        res = requests.get(link, headers=headers, timeout=10)
        html = etree.HTML(res.content, parser=etree.HTMLParser(encoding='utf8'))

        # Guard every lookup like foodSpider does: the original indexed [0]
        # unchecked, so a page missing one node raised IndexError and the
        # caller's broad `except` silently dropped the whole listing page.
        videos = html.xpath('//div[@class="action-video"]/video/@src')
        video = videos[0].strip() if videos else ''

        skills = html.xpath('//div[@class="action-detail"]/pre/text()')
        skill = skills[0].strip() if skills else ''

        infos = html.xpath('//div[@class="info-bd"]')
        info = etree.tostring(infos[0], encoding='utf8').decode('utf-8') if infos else ''
        return (video, skill, info)

    details = [getdetail(link) for link in list_link] if list_link else []
    return zip(list_name, list_img, details) if details else None
    
if __name__ == '__main__':
    # Crawl exercise listing pages 1..29; each page is best-effort — a
    # failure is logged and the remaining pages are still attempted.
    LAST_PAGE = 30
    for page_no in range(1, LAST_PAGE):
        try:
            saveSportsData(sportSpider(page_no))
        except Exception as e:
            print(e)