In PyCharm, the Python packages installed for a project are listed under File -> Settings -> Project: lx2 -> Project Interpreter.
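If a package is missing from that list, the third-party libraries used below can also be installed from the command line; for example (assuming pip is available on the PATH):
pip install jieba wordcloud matplotlib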
Full code listing
import os
import re
import jieba
import urllib.request
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from urllib.request import urlopen
from urllib import parse
def loadPage(url, filename):
    """
    Send a request to the given url and return the server's response.
    url: the URL to crawl
    filename: name of the file being processed
    """
    print("正在下载 " + filename)
    html = urlopen(url).read().decode("utf-8")
    return html
def writePage(html, filename):
    """
    Write the html content to a local file.
    html: the response content returned by the server
    filename: name of the file to write
    """
    print("正在保存 " + filename)
    # write the file
    with open(new_file_name + kw + filename, "w", encoding="utf-8") as f:
        f.write(html)
    print("-" * 30)
def tiebaSpider(url, beginPage, endPage):
    """
    Tieba crawler scheduler: builds the URL for each page and downloads it.
    url: the fixed prefix of the Tieba URL
    beginPage: first page to crawl
    endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "第" + str(page) + "页.html"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
def newpage(url):
    """
    Fetch the latest threads of the Tieba forum and export them.
    fullurl: complete request URL
    data_topic_name: thread authors
    data_made_time: creation times
    data_main_info: thread titles
    """
    fullurl = url + key + "&ie=utf-8&pn="
    header = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0")
    # build one opener, attach the User-Agent header to it, and install that same opener
    opener = urllib.request.build_opener()
    opener.addheaders = [header]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(fullurl).read().decode()
    data_topic_name = re.compile('title="主题作者:(.*?)"').findall(data)
    data_made_time = re.compile('title="创建时间">(.*?)</span>').findall(data)
    data_main_info = re.compile('<a rel="noreferrer" href="/p/.*?" title="(.*?)"').findall(data)
    data_end_name = re.compile('title="最后回复人:(.*?)"').findall(data)
    data_img_picture = re.compile('class="thumbnail vpic_wrap"><img src="" attr=".*?" data-original="(.*?)"').findall(data)
    for picture in range(0, len(data_img_picture)):
        imgurl = data_img_picture[picture]
        file_name = new_file_name + kw + str(picture) + ".jpg"
        urllib.request.urlretrieve(imgurl, filename=file_name)
    mademakdowntitle(biaochang)
    markdownbiao(biaochang, data_made_time)
    markdownbiao(biaochang, data_topic_name)
    markdownbiao(biaochang, data_main_info)
    markdownbiao(biaochang, data_end_name)
    tiezhun = "热门贴主"
    tiezi = "热门帖子"
    huoyue = "贴吧活跃"
    tiebayuntu(data_topic_name, tiezhun)
    tiebayuntu(data_main_info, tiezi)
    tiebayuntu(data_end_name, huoyue)
def markdownbiao(biaochang, dataname):
    """
    Write one list of scraped data as a single row of the Markdown table.
    biaochang: number of table columns (table length + 1)
    dataname: the list of cell values to write into this row
    """
    i = 0
    for i in range(0, biaochang):
        if i == 0:
            file.write(str(jianduan))
        elif i < biaochang:
            file.write(dataname[i - 1] + str(jianduan))
        else:
            file.write(str(jianduan))
    file.write("\n")
def mademakdowntitle(biaochang):
    """Write the Markdown table header row and the separator row below it."""
    for biaoge in range(0, biaochang):
        if biaoge == 0:
            file.write(jianduan)
        elif biaoge < biaochang:
            file.write(str(biaoge) + jianduan)
        else:
            file.write(jianduan)
    file.write("\n")
    # the separator row must have as many cells as the header row, so iterate biaochang times
    for timebiao in range(0, biaochang):
        if timebiao == 0:
            file.write(jianduan)
        elif timebiao < biaochang:
            file.write("----" + jianduan)
        else:
            file.write(jianduan)
    file.write("\n")
def mknewdir(name):
    """
    Check whether the directory exists and create it if it does not.
    name: path of the directory to create
    """
    isExists = os.path.exists(name)
    if not isExists:
        # the directory does not exist yet, so create it
        os.makedirs(name)
        print(name + "目录创建成功")
        return True
    else:
        # the directory already exists, so do nothing
        print(name + "目录已存在")
        return False
def tiebayuntu(data_name, picture_name):
    """Segment the scraped text with jieba and render it as a word-cloud image."""
    text = str(data_name)
    cut_text = jieba.cut(text)
    result = " ".join(cut_text)
    wc = WordCloud(
        font_path='FZMengRTJW.TTF',  # font path
        background_color='white',    # background color
        width=1920,                  # output image width
        height=1080,                 # output image height
        max_font_size=100,           # maximum font size
        min_font_size=10,            # minimum font size
        # mask=plt.imread('./jingyu.png'),  # optional mask image
        max_words=1000
    )
    wc.generate(result)
    wc.to_file(new_file_name + kw + picture_name + ".png")
    plt.figure('贴吧热门贴主')
    plt.axis('off')
if __name__ == "__main__":
    # input section
    kw = input("请输入需要爬取的贴吧名:")
    new_file_name = "./" + kw + "/"
    # create the output directory
    mknewdir(new_file_name)
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))
    # if only the first page is crawled, the table length can be customized; otherwise it defaults to 30
    if (beginPage == 1) and (endPage == 1):
        biaochang = int(input("请输入markdown表长"))
    else:
        biaochang = 30
    file = open(new_file_name + kw + "yanshi.md", "w", encoding="utf-8")
    jianduan = "|"
    biaochang = biaochang + 1
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})  # .encode("utf-8")
    fullurl = url + key
    print("key=", key)
    newpage(url)
    tiebaSpider(fullurl, beginPage, endPage)
    html = urlopen("http://tieba.baidu.com/f?kw=python&pn=100")
    file.close()
Key code walkthrough
if __name__ == "__main__":
    # input section
    kw = input("请输入需要爬取的贴吧名:")
    new_file_name = "./" + kw + "/"
    # create the output directory
    mknewdir(new_file_name)
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))
    # if only the first page is crawled, the table length can be customized; otherwise it defaults to 30
    if (beginPage == 1) and (endPage == 1):
        biaochang = int(input("请输入markdown表长"))
    else:
        biaochang = 30
    file = open(new_file_name + kw + "yanshi.md", "w", encoding="utf-8")
    jianduan = "|"
    biaochang = biaochang + 1
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})  # .encode("utf-8")
    fullurl = url + key
    print("key=", key)
    newpage(url)
    tiebaSpider(fullurl, beginPage, endPage)
    html = urlopen("http://tieba.baidu.com/f?kw=python&pn=100")
if __name__ == "__main__": plays a role similar to int main() in C: it marks the program's entry point. The code under this guard runs only when the file is executed directly as a program; when the file is imported as a module, the guard is false and the block is skipped, while the rest of the file is still executed line by line on import.
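A minimal illustration of the guard (the file name demo.py and the function greet are made up for this example):
# demo.py -- hypothetical file used only to illustrate the guard
def greet():
    print("hello")

if __name__ == "__main__":
    # runs only when the file is executed directly: python demo.py
    greet()
# if another file does `import demo`, greet() is defined but not called here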
fullurl = url + key + "&ie=utf-8&pn="
header = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.")
urllib.request.build_opener().addheaders = [header]
urllib.request.install_opener(urllib.request.build_opener())
data = urllib.request.urlopen(fullurl).read().decode()
data_topic_name = re.compile('title="主题作者:(.*?)"').findall(data)
data_made_time = re.compile('title="创建时间">(.*?)</span>').findall(data)
data_main_info = re.compile('<a rel="noreferrer" href="/p/.*?" title="(.*?)"').findall(data)
data_end_name = re.compile('title="最后回复人:(.*?)"').findall(data)
data_img_picture = re.compile('class="thumbnail vpic_wrap"><img src="" attr=".*?" data-original="(.*?)"').findall(data)
In the code above,
header = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0")
opener = urllib.request.build_opener()
opener.addheaders = [header]
attaches a User-Agent header so that the request looks as if it comes from a browser rather than a script, and
urllib.request.install_opener(opener)
data = urllib.request.urlopen(fullurl).read().decode()
installs that opener globally and then reads the page content into data.
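The same effect can also be achieved per request with urllib.request.Request instead of installing a global opener; a minimal sketch, reusing the python Tieba URL that appears later in the script:
import urllib.request

req = urllib.request.Request(
    "http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0"},
)
data = urllib.request.urlopen(req).read().decode()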
data_topic_name = re.compile('title="主题作者:(.*?)"').findall(data)
searches data, i.e. the full page source, for every occurrence of title="主题作者:" and captures the text that follows it; the matches are collected into data_topic_name, and all later processing works on the lists obtained this way.
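For example, applied to a small hand-written fragment of page source (the fragment below is made up for illustration):
import re

sample = '<span title="主题作者:alice"></span><span title="主题作者:bob"></span>'
authors = re.compile('title="主题作者:(.*?)"').findall(sample)
print(authors)  # ['alice', 'bob']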
Writing the filtered data into a Markdown table from Python
def markdownbiao(biaochang, dataname):
    """
    Write one list of scraped data as a single row of the Markdown table.
    biaochang: number of table columns (table length + 1)
    dataname: the list of cell values to write into this row
    """
    i = 0
    for i in range(0, biaochang):
        if i == 0:
            file.write(str(jianduan))
        elif i < biaochang:
            file.write(dataname[i - 1] + str(jianduan))
        else:
            file.write(str(jianduan))
    file.write("\n")
This code takes one list of Python data and writes it into the Markdown file as a single table row.
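For example, a hypothetical call markdownbiao(4, ["a", "b", "c"]) writes the single line |a|b|c| to the open Markdown file: the first iteration writes the leading |, and each later iteration writes one list element followed by |.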
def mademakdowntitle(biaochang):
    """Write the Markdown table header row and the separator row below it."""
    for biaoge in range(0, biaochang):
        if biaoge == 0:
            file.write(jianduan)
        elif biaoge < biaochang:
            file.write(str(biaoge) + jianduan)
        else:
            file.write(jianduan)
    file.write("\n")
    # the separator row must have as many cells as the header row, so iterate biaochang times
    for timebiao in range(0, biaochang):
        if timebiao == 0:
            file.write(jianduan)
        elif timebiao < biaochang:
            file.write("----" + jianduan)
        else:
            file.write(jianduan)
    file.write("\n")
This code builds the head of the Markdown table from the given table length: a numbered header row followed by the separator row.
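For example, with a hypothetical biaochang of 4 the function writes the two lines
|1|2|3|
|----|----|----|
which form the header of a three-column Markdown table; the rows written by markdownbiao then fill in the columns below it.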
Click here to get the full project code.