# python 爬虫 — Python web scraper (CSDN blog article lists)
#
# (Original article metadata: 132 views, 2-minute read)
# coding=utf-8

from selenium import webdriver
import selenium.webdriver.support.ui as ui
import re
import time
import pymysql

# Launch a Chrome browser session and create an explicit-wait helper with a
# 10-second timeout; both are module-level globals shared by the functions below.
driver = webdriver.Chrome()
wait = ui.WebDriverWait(driver, 10)


# Read the pagination widget at the bottom of a blogger's article-list page
# (already loaded in the global `driver`) and return the total page count.
def getPage():
    print('getPage')
    # Full visible text of the pager element.
    texts = driver.find_element_by_xpath("//div[@id='pageBox']").text
    print('页码', texts)
    # Extract every number from the pager text; the second number is taken as
    # the total page count on CSDN's pager layout (NOTE(review): mirrors the
    # original index choice — confirm against the live markup).
    m = re.findall(r'(\w*[0-9]+)\w*', texts)
    print('页数:' + str(m[1]))
    return int(m[1])


# Main driver: read blogger URLs from Blog_URL.txt, scrape every page of each
# blogger's article list with the global Selenium driver, and insert one row
# per article into the MySQL table `csdn`.
def main():
    # Read every URL once; the original opened the file twice ('rU' mode was
    # removed in Python 3.11) and never closed either handle.
    with open("Blog_URL.txt", "r") as urlfile:
        urls = [line.strip("\n") for line in urlfile]
    print(len(urls))

    for url in urls:
        print(url)

        # Load the blogger's main page and read the total page count.
        driver.get(url)
        allPage = getPage()
        print(u'页码总数为:', allPage)

        time.sleep(1)

        # Pre-bind so the finally-block cleanup is safe even when connect()
        # itself raises (the original hit NameError on cur/conn in that case).
        conn = None
        cur = None
        try:
            # One connection per blogger; charset='utf8' replaces the old
            # SET NAMES / SET CHARACTER SET boilerplate.
            conn = pymysql.connect(host='localhost', user='root',
                                   passwd='123456', port=3306, db='test01',
                                   charset='utf8')
            cur = conn.cursor()

            # Row-insert statement: 8 columns per article.
            sql = '''insert into csdn
                        (URL,Author,Artitle,Description,Manage,FBTime,YDNum,PLNum)
                    values(%s, %s, %s, %s, %s, %s, %s, %s)'''
            # Loop invariants hoisted: author slug (last URL segment) and the
            # number-extracting regex (the original recompiled it per row).
            Author = url.split('/')[-1]
            num_re = re.compile(r'\d+\.?\d*')

            for m in range(1, allPage + 1):
                page_url = url + "/article/list/" + str(m)
                print(page_url)
                driver.get(page_url)

                # Title boxes, summaries and meta-info rows — CSDN renders
                # these as parallel lists, one entry per article.
                article_title = driver.find_elements_by_xpath(
                    "//div[@class='article-item-box csdn-tracking-statistics']")
                article_description = driver.find_elements_by_xpath(
                    "//div/p[@class='content']")
                article_manage = driver.find_elements_by_xpath(
                    "//div[@class='info-box d-flex align-content-center']")

                print(u'长度', len(article_title))

                # zip() guards against the three lists differing in length;
                # the original indexed by position and could IndexError.
                for title_el, desc_el, manage_el in zip(
                        article_title, article_description, article_manage):
                    Artitle = title_el.find_element_by_tag_name("h4").text
                    Description = desc_el.text
                    Manage = manage_el.text
                    print("Artitle: ", Artitle)
                    print("Description: ", Description)
                    print("Manage: ", Manage)

                    # The last two numbers in the meta text are the read count
                    # and the comment count respectively.
                    nums = num_re.findall(Manage)
                    YDNum = nums[-2]
                    PLNum = nums[-1]
                    print("YDNum: ", YDNum)
                    print("PLNum: ", PLNum)
                    # Publication timestamp is the leading 19 characters,
                    # e.g. "2019-01-01 12:00:00" — TODO confirm format.
                    FBTime = Manage[:19]
                    cur.execute(sql, (url, Author, Artitle, Description,
                                      Manage, FBTime, YDNum, PLNum))

                print(u'本页数据库插入成功')

            # Commit only after every page of this blogger succeeded; the
            # original committed unconditionally in `finally`, persisting
            # partial data even after an error.
            conn.commit()

        except pymysql.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

        finally:
            # Guarded cleanup: either handle may still be None on failure.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

    print('Load Over')


main()