from selenium import webdriver
import selenium.webdriver.support.ui as ui
import re
import time
import pymysql
# Module-level Selenium Chrome session, shared by getPage() and main().
driver = webdriver.Chrome()
# Explicit-wait helper with a 10 s timeout; not referenced by the code below.
wait = ui.WebDriverWait(driver, 10)
def getPage():
    """Return the total number of article-list pages for the blog
    currently loaded in ``driver``.

    Reads the text of the ``#pageBox`` pagination element and extracts
    the second number appearing in it (CSDN renders the total there).

    Returns:
        int: the total page count.

    Raises:
        IndexError: if fewer than two numbers appear in the pagination
            text (e.g. the page layout changed).
    """
    print('getPage')
    texts = driver.find_element_by_xpath("//div[@id='pageBox']").text
    print('页码', texts)
    # findall with one capturing group returns the group matches only;
    # m[1] is the second number in the pagination text (the page total).
    m = re.findall(r'(\w*[0-9]+)\w*', texts)
    print('页数:' + str(m[1]))
    return int(m[1])
def main():
    """Scrape every blog listed in ``Blog_URL.txt``.

    For each URL: load it in the shared Selenium ``driver``, determine the
    page count via :func:`getPage`, walk every ``/article/list/<m>`` page,
    and insert one row per article (title, description, stats, timestamp,
    read/comment counts) into the ``csdn`` table of the ``test01`` MySQL
    database. Progress is printed throughout.
    """
    # Read all URLs up front with a context manager. The original opened
    # the file twice (once just to count lines) and never closed the read
    # handle; it also used mode 'rU', which was removed in Python 3.11.
    with open("Blog_URL.txt", "r") as urlfile:
        urls = [line.strip("\n") for line in urlfile]
    count = len(urls)
    print(count)
    for url in urls:
        print(url)
        driver.get(url)
        allPage = getPage()
        print(u'页码总数为:', allPage)
        time.sleep(1)
        # Pre-bind so the finally block is safe even if connect() fails
        # before cur/conn are assigned (the original raised NameError).
        conn = None
        cur = None
        try:
            conn = pymysql.connect(host='localhost', user='root',
                                   passwd='123456', port=3306, db='test01')
            cur = conn.cursor()
            conn.set_charset('utf8')
            cur.execute('SET NAMES utf8;')
            cur.execute('SET CHARACTER SET utf8;')
            cur.execute('SET character_set_connection=utf8;')
            # Loop-invariant: compile the number-extraction regex once
            # instead of once per article.
            mode = re.compile(r'\d+\.?\d*')
            m = 1
            while m <= allPage:
                ur = url + "/article/list/" + str(m)
                print(ur)
                driver.get(ur)
                article_title = driver.find_elements_by_xpath(
                    "//div[@class='article-item-box csdn-tracking-statistics']")
                for title in article_title:
                    print(url)
                    con = title.find_element_by_tag_name("h4").text
                    con = con.strip("\n")
                    print(con + '\n')
                article_description = driver.find_elements_by_xpath(
                    "//div/p[@class='content']")
                for description in article_description:
                    con = description.text
                    con = con.strip("\n")
                    print(con + '\n')
                article_manage = driver.find_elements_by_xpath(
                    "//div[@class='info-box d-flex align-content-center']")
                for manage in article_manage:
                    con = manage.text
                    con = con.strip("\n")
                    print(con + '\n')
                num = 0
                print(u'长度', len(article_title))
                while num < len(article_title):
                    sql = '''insert into csdn
                    (URL,Author,Artitle,Description,Manage,FBTime,YDNum,PLNum)
                    values(%s, %s, %s, %s, %s, %s, %s, %s)'''
                    Artitle = article_title[num].find_element_by_tag_name("h4").text
                    Description = article_description[num].text
                    Manage = article_manage[num].text
                    print("Artitle: ", Artitle)
                    print("Description: ", Description)
                    print("Manage: ", Manage)
                    # Author is the last path segment of the blog URL.
                    Author = url.split('/')[-1]
                    # Last two numbers in the stats text are read count
                    # and comment count, in that order.
                    YDNum = mode.findall(Manage)[-2]
                    PLNum = mode.findall(Manage)[-1]
                    print("YDNum: ", YDNum)
                    print("PLNum: ", PLNum)
                    # First 19 chars of the stats text hold the publish
                    # timestamp ("YYYY-MM-DD HH:MM:SS").
                    FBTime = Manage[:19]
                    # Parameterized query: values are escaped by PyMySQL.
                    cur.execute(sql, (url, Author, Artitle, Description,
                                      Manage, FBTime, YDNum, PLNum))
                    num = num + 1
                else:
                    print(u'本页数据库插入成功')
                m = m + 1
        except pymysql.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        finally:
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.commit()
                conn.close()
    print('Load Over')
main()