爬虫爬热榜

151 阅读1分钟

# -*- coding: utf-8 -*-

# @Time : 2021/1/26 10:19

# @Author : 老七疯狂吸氧

# @file hotlist1.py

# @Software:PyCharm

import requests
import re
import time
import urllib.parse


def main():
    """Scrape every configured ranking board and append its top entries to a text file.

    For each board the page is downloaded with ``get_html``, parsed with
    ``saveurl`` and the first N entries are written one per line through
    ``savelist`` (which appends to ``<board name><YYYY-MM-DD>.txt``).
    """
    t = time.strftime('%Y-%m-%d')  # today's local date, used in the output file name

    # Map each ranking board to [page URL, number of entries to save].
    # The board names are used verbatim in file names and console output.
    # NOTE(review): every URL below ends in "…", i.e. it looks truncated by the
    # page this script was copied from -- TODO restore the full board ids.
    # The "https://" scheme was added because requests refuses scheme-less URLs.
    urllist = {
        "微博今日热榜": ["https://tophub.today/n/KqndgxeLl…", 50],
        "虎嗅网热文榜": ["https://tophub.today/n/5VaobgvAj…", 15],
        "csdn技术区热帖": ["https://tophub.today/n/K7GdajgeQ…", 50],
        "知乎热榜": ["https://tophub.today/n/mproPpoq6…", 50],
        "B站日榜": ["https://tophub.today/n/74Kvxwokx…", 100],
        "six氪日榜": ["https://tophub.today/n/Q1Vd5Ko85…", 10],
        "吾爱破解日榜": ["https://tophub.today/n/NKGoRAzel…", 15],
        "豆瓣电影新片榜": ["https://tophub.today/n/mDOvnyBoE…", 10],
    }

    for board, (page_url, limit) in urllist.items():
        entries = saveurl(get_html(page_url))
        # Slice instead of indexing up to `limit`, so a page that yields fewer
        # entries than requested is saved as-is rather than raising IndexError.
        top_entries = list(entries.items())[:limit]
        for rank, (title, (heat, link)) in enumerate(top_entries, start=1):
            content = "{}.{}  {}  {}".format(rank, title, heat, link)
            savelist(content, board, t)
        print("爬取", board, "完毕")

def get_html(url, timeout=10):
    """Send a single GET request to *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the script forever.

    Returns:
        The decoded response text (``response.text``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }  # replace with your own User-Agent
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def saveurl(baseurl):
    """Filter the entries out of a downloaded page.

    Args:
        baseurl: Text to search. Despite the name, ``main`` passes the page
            text returned by ``get_html`` here, not a URL.

    Returns:
        A dict mapping each match of the first pattern to
        ``[match of the second pattern, absolute link]``. Later duplicates of
        a key overwrite earlier ones.
    """
    # NOTE(review): the first two patterns are identical and match at most one
    # arbitrary character -- they look garbled (HTML tags probably stripped
    # when the script was pasted). TODO restore the intended patterns.
    findlink = re.compile(r'(.?)')
    findlink2 = re.compile(r'(.?)')
    findlink3 = re.compile(r'<a href="(.*?)" target=')

    cid = re.findall(findlink, baseurl)
    hot = re.findall(findlink2, baseurl)
    url = [splicing(link) for link in re.findall(findlink3, baseurl)]

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times no longer raises IndexError.
    news = {}
    for title, heat, link in zip(cid, hot, url):
        news[title] = [heat, link]
    return news


def savelist(list, name, t):
    """Append one line of text to the file ``<name><t>.txt`` (UTF-8).

    Args:
        list: The text to store (a string; the parameter name is kept for
            backward compatibility even though it shadows the builtin).
        name: First part of the output file name.
        t: Second part of the output file name (``main`` passes the date).
    """
    txtname = name + t
    # The context manager closes the file even if the write fails.
    with open(txtname + ".txt", "a", encoding="utf-8") as out:
        out.write(list + "\n")


def splicing(get_url):
    """Turn a link found on the site into an absolute URL.

    Args:
        get_url: A relative path or an absolute URL.

    Returns:
        *get_url* resolved against ``https://tophub.today``.
    """
    # The base needs a scheme: urljoin() with a bare 'tophub.today' base just
    # returns the relative path unchanged.
    url = 'https://tophub.today'
    return urllib.parse.urljoin(url, get_url)


if __name__ == '__main__':
    main()