# Food spider: scrapes per-food nutrition data from m.food.hiyd.com.
import requests
from lxml import etree

from db import saveFoodData

def foodSpider(index, page):
    headers = {
        # Generic desktop UA; the original User-Agent literal was truncated,
        # so a standard Mac Chrome string is substituted here.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'http://m.food.hiyd.com/',
    }
    # Ajax list endpoint: group_id selects the food category, page paginates.
    url = ('http://m.food.hiyd.com/food/list/'
           '?group_id=%s&kind=1&page=%s&_loader=1&_from=ajax' % (index, page))
    print(url)
    req = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(req.content, parser=etree.HTMLParser(encoding='utf8'))

    # The <h3> text may carry comma-separated extras; keep the first part.
    list_name = selector.xpath('//ul[@id="foodList"]/li/a/div[@class="cont"]/h3/text()')
    list_name = [i.split(',')[0] for i in list_name]
    list_img = selector.xpath('//ul[@id="foodList"]/li/a/div[@class="img-wrap"]/img/@src')
    list_link = selector.xpath('//ul[@id="foodList"]/li/a/@href')

    def getdetail(link):
        print(link)
        # Detail links are protocol-relative ("//..."), so prepend the scheme.
        res = requests.get('http:' + link, headers=headers, timeout=10)
        html = etree.HTML(res.content, parser=etree.HTMLParser(encoding='utf8'))

        def first_or_unknown(expr):
            # First matched text node, or '未知' ("unknown") when absent.
            nodes = html.xpath(expr)
            return nodes[0].strip() if nodes else '未知'

        heat = first_or_unknown('//div[@class="box-row2"]/div[1]/p/em/text()')    # calories
        prot = first_or_unknown('//div[@class="box-row2"]/div[2]/p/em/text()')    # protein
        carbon = first_or_unknown('//div[@class="box-row2"]/div[3]/p/em/text()')  # carbohydrate
        zhi = first_or_unknown('//div[@class="box-row2"]/div[4]/p/em/text()')     # fat
        # The description is usually the third .box-bd block; fall back to the second.
        boxes = html.xpath('//div[@class="box-bd"]')
        info = etree.tostring(boxes[2] if len(boxes) > 2 else boxes[1],
                              encoding='utf8').decode('utf-8')
        return (heat, prot, carbon, zhi, info)

    details = [getdetail(link) for link in list_link] if list_link else []
    return zip(list_name, list_img, details) if details else None


if __name__ == '__main__':
    # group_id values to crawl; saveFoodData receives the 1-based position of
    # each id in this list as the category number.
    page = [1, 2, 4, 5, 6, 7, 8, 910, 132]
    # Category names kept for reference: staple foods, meat & eggs, dairy
    # products, fruits & vegetables, nuts & soy products, drinks, cooking oil,
    # condiments, snacks, other, dishes.
    name = ['主食', '肉蛋类', '奶类及制品', '蔬果', '坚果豆制品',
            '饮料', '食用油', '调味品', '零食', '其它', '菜肴']
    for p in page:
        for i in range(1, 5):  # crawl the first four pages of each category
            try:
                res = foodSpider(p, i)
                print(res)
                saveFoodData(page.index(p) + 1, res)
            except Exception as e:
                print(e)
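

# ---------------------------------------------------------------------------
# Hypothetical sketch of the saveFoodData helper imported from db above. The
# real db module is not part of this source; this is one plausible sqlite3-
# backed version, inferred only from the call site. The database file, table
# name, and column names are assumptions, not the author's schema. rows is
# None or an iterable of (name, img, (heat, prot, carbon, fat, info)) tuples.
# ---------------------------------------------------------------------------
import sqlite3

def saveFoodData(category_id, rows):
    if rows is None:  # foodSpider returns None when a page has no items
        return
    conn = sqlite3.connect('hiyd.db')
    conn.execute('CREATE TABLE IF NOT EXISTS food ('
                 'category INTEGER, name TEXT, img TEXT, heat TEXT, '
                 'protein TEXT, carbon TEXT, fat TEXT, info TEXT)')
    conn.executemany('INSERT INTO food VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
                     [(category_id, name, img, *detail)
                      for name, img, detail in rows])
    conn.commit()
    conn.close()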

# Sport spider (a separate script from the food spider above): scrapes
# exercise demos from m.hiyd.com.
import requests
from lxml import etree
from urllib import parse

from db import saveSportsData

def sportSpider(page):
    headers = {
        # Generic desktop UA; the original User-Agent literal was truncated,
        # so a standard Mac Chrome string is substituted here.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'http://m.hiyd.com/',
    }
    url = 'http://m.hiyd.com/dongzuo/?gender=1&page=%s&_loader=1&_from=ajax' % page
    req = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(req.content, parser=etree.HTMLParser(encoding='utf8'))

    # XPath @class equality is an exact string compare, so the trailing space
    # in "main-list " must match the page's class attribute verbatim.
    list_name = selector.xpath('//ul[@class="main-list "]/li/a/div[@class="item-cont"]/h3/text()')
    list_img = selector.xpath('//ul[@class="main-list "]/li/a/div[@class="item-pic"]/img/@src')
    list_link = selector.xpath('//ul[@class="main-list "]/li/a/@href')
    # Detail links are relative; resolve them against the list URL.
    list_link = [parse.urljoin(url, i) for i in list_link]

    def getdetail(link):
        print(link)
        res = requests.get(link, headers=headers, timeout=10)
        html = etree.HTML(res.content, parser=etree.HTMLParser(encoding='utf8'))
        # Demo video URL, technique notes, and the raw info block as HTML.
        video = html.xpath('//div[@class="action-video"]/video/@src')[0].strip()
        skill = html.xpath('//div[@class="action-detail"]/pre/text()')[0].strip()
        info = etree.tostring(html.xpath('//div[@class="info-bd"]')[0],
                              encoding='utf8').decode('utf-8')
        return (video, skill, info)

    details = [getdetail(link) for link in list_link] if list_link else []
    return zip(list_name, list_img, details) if details else None


if __name__ == '__main__':
    page = 30
    for p in range(1, page):  # pages 1..29
        try:
            res = sportSpider(p)
            saveSportsData(res)
        except Exception as e:
            print(e)
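

# ---------------------------------------------------------------------------
# Hypothetical sketch of the saveSportsData helper imported from db above,
# again inferred only from the call site and not the author's actual module.
# The database file, table name, and column names are assumptions. rows is
# None or an iterable of (name, img, (video, skill, info)) tuples.
# ---------------------------------------------------------------------------
import sqlite3

def saveSportsData(rows):
    if rows is None:  # sportSpider returns None when a page has no items
        return
    conn = sqlite3.connect('hiyd.db')
    conn.execute('CREATE TABLE IF NOT EXISTS sports ('
                 'name TEXT, img TEXT, video TEXT, skill TEXT, info TEXT)')
    conn.executemany('INSERT INTO sports VALUES (?, ?, ?, ?, ?)',
                     [(name, img, *detail) for name, img, detail in rows])
    conn.commit()
    conn.close()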