阅读 419

python爬取csdn文章到wordpress

#!/usr/bin/env python
# coding=utf8
'''python 爬取csdn 文章到 wordpress '''

import requests
import re
import json
import time
from bs4 import BeautifulSoup
from lxml import etree
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import NewPost
from csdn import CSDN
import pymysql

class WordPress:
    """Import scraped CSDN articles into a WordPress site.

    Two import paths exist: ``sends`` publishes through the XML-RPC API,
    while ``query`` writes rows straight into the WordPress MySQL tables.
    ``create`` scrapes one article and stores it through ``query``.
    """

    def __init__(self):
        """Open the XML-RPC client and the MySQL connection used for imports."""
        # NOTE(review): endpoint, database host and credentials are hard-coded
        # here; consider moving them to configuration.
        self.wp = Client('http://blog.zxb8.cc/xmlrpc.php', 'username', 'password')
        self.conn = pymysql.connect(host='104.224.151.80', port=3306, user='xxx', passwd='xx', db='blog',charset='utf8')
        self.cursor = self.conn.cursor()

    # Import a post through the XML-RPC API.
    def sends(self,title,content):
        """Publish one post through the WordPress XML-RPC endpoint.

        Args:
            title: Post title.
            content: Post body (HTML).
        """
        post = WordPressPost()
        post.title = title
        post.content = content
        post.post_status = 'publish'
        # Send the post to WordPress.
        self.wp.call(NewPost(post))
        time.sleep(3)
        print('发布成功')

    # Scrape one article and import it.
    def create(self,url):
        """Scrape the CSDN article at ``url`` and store it via ``query``.

        Uploaded image tags returned by the scraper are appended to the end
        of the article text.  The post is filed under taxonomy id 1.

        Args:
            url: Address of the CSDN article page.
        """
        print(url)
        csdn = CSDN(url)
        title = csdn.getTitle()
        content = csdn.getContent()
        img = csdn.getImg()
        print(img)
        # Truthiness covers both an empty list and a None result.
        if img:
            content += " ".join(img)
        #self.sends(title, content)
        self.query(title,content,1)
        time.sleep(3)
        print('发布成功')

    # Import a post by writing to the database through pymysql.
    # The set of tables to insert into / update was derived from the MySQL binlog.
    def query(self,title,content,cat):
        """Insert a published post directly into the WordPress tables.

        Runs three statements in one transaction: insert the ``wp_posts``
        row, set its ``guid`` from the new row id, and link the post to a
        taxonomy in ``wp_term_relationships``.  All values are passed as
        bound parameters so quotes in the title or body cannot break or
        alter the SQL.

        Args:
            title: Post title; also stored as ``post_name``.
            content: Post body text.
            cat: ``term_taxonomy_id`` to attach the post to.

        Raises:
            Exception: Whatever the database driver raises; the transaction
                is rolled back before the error is re-raised.
        """
        # Strip the source author's reprint notice.
        content = content.replace("【工匠若水 http://blog.csdn.net/yanbober 未经允许严禁转载,请尊重作者劳动成果。私信联系我】","")

        # strftime without a time tuple formats the current local time.
        times = time.strftime('%Y-%m-%d %H:%M:%S')
        # Wrap the body so the highlight plugin renders it.
        content = '<pre class ="pure-highlightjs"> <code class ="">' + content +'</code> </pre>'

        sql_post = (
            "INSERT INTO wp_posts(post_author,post_date,post_content,post_title,"
            "post_excerpt,post_status,comment_status,ping_status,post_name,to_ping,"
            "pinged,post_modified,post_content_filtered,post_parent,menu_order,"
            "post_type,comment_count) "
            "VALUES ('1',%s,%s,%s,'','publish','open','open',%s,'','',%s,'','0','0','post','0')"
        )
        update_sql = "UPDATE `wp_posts` SET `guid` = %s WHERE `ID` = %s"
        sql_cat = "INSERT INTO wp_term_relationships(object_id,term_taxonomy_id,term_order) VALUES (%s,%s,'0')"

        try:
            # Insert the post; escaping is left to the driver.
            self.cursor.execute(
                sql_post,
                (str(times), str(content), str(title), str(title), str(times)),
            )
            new_id = self.cursor.lastrowid

            # The guid can only be written once the row id is known.
            guid = "http://blog.zxb8.cc/?p={}".format(new_id)
            self.cursor.execute(update_sql, (guid, new_id))

            # Attach the post to its category.
            self.cursor.execute(sql_cat, (new_id, cat))

            self.conn.commit()
        except Exception:
            # Do not leave a half-written post in the open transaction.
            self.conn.rollback()
            raise


if __name__ == '__main__':
    # Entry point: fetch one CSDN category listing page and import every
    # article linked from it into WordPress.
    from urllib.parse import urljoin

    # Browser-like User-Agent sent with the listing request.
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }

    wordpress = WordPress()

    list_url = "https://blog.csdn.net/yanbober/article/category/6971209"

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(list_url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently importing nothing.
    response.raise_for_status()

    selector = etree.HTML(response.text)

    # for url in selector.xpath('//li[@class="blog-unit"]/a/@href'):
    for url in selector.xpath('//div[@class="article_title"]//a/@href'):
        # Resolve relative hrefs against the listing page; absolute URLs are
        # returned unchanged by urljoin.
        url = urljoin(list_url, url)
        print('正在努力爬取中...', url)
        wordpress.create(url)

    # i = 1
    # while i<=5:
    #     url = "https://blog.csdn.net/mrlevo520/article/list/{}".format(i)
    #     i=i+1
    #     response = requests.get(list_url,headers=headers)
    #     selector = etree.HTML(response.text)
    #     for url in selector.xpath('//li[@class="blog-unit"]/a/@href'):
    #         print('正在努力爬取中...',url)
    #         wordpress.create(url)


#!/usr/bin/env python
#  -*- coding:utf-8 -*-

#https://blog.csdn.net/MrLevo520/article/details/53158050

import requests
import json
import os
from lxml import etree
import time
import random
from datetime import *


class CSDN():
    """Scraper for a single CSDN blog article page.

    The page is downloaded and parsed once in the constructor; the ``get*``
    methods then read from the parsed tree via XPath.
    """

    def __init__(self,url):
        """Download and parse the article at ``url``.

        Args:
            url: Address of the CSDN article page.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.url = url
        self.selector = etree.HTML(self.getHtml())

    # Fetch the page source.
    def getHtml(self):
        """Return the article page HTML as text."""
        # Send the same browser-like headers used for image downloads.
        response = requests.get(self.url, headers=self.headers)
        return response.text

    # Get the title.
    def getTitle(self):
        """Return the article title with surrounding whitespace removed.

        Raises:
            IndexError: If the title element is not found on the page.
        """
        # title = self.selector.xpath('//h1[@class="csdn_top"]/text()')
        title = self.selector.xpath('//span[@class="link_title"]/a/text()')
        return title[0].strip()

    # Get the tags.
    def getTag(self):
        """Return the first text node of each tag link as a list.

        Links without any text node are skipped.
        """
        tags = self.selector.xpath('//div[@id="article_details"]/div[contains(@class,"article_manage")]//div[@class="article_l"]//a')
        taglist = []
        for tag in tags:
            texts = tag.xpath('./text()')
            if texts:
                taglist.append(texts[0])
        return taglist


    def getReadNum(self):
        """Print the text of the first three spans of the article stats block.

        The original author labels them publish time, read count and
        comment count.  Nothing is returned.
        """
        stats = '//div[@id="article_details"]/div[contains(@class,"article_manage")]//div[@class="article_r"]'
        # Local names avoid shadowing any module-level ``time``.
        publish_time = self.selector.xpath(stats + '/span[1]/text()')
        read = self.selector.xpath(stats + '/span[2]/text()')
        comment = self.selector.xpath(stats + '/span[3]/text()')
        print(publish_time)
        print(read)
        print(comment)

    # Get the article body.
    def getContent(self):
        """Return the article body as plain text, stripped.

        Raises:
            IndexError: If the body container is not found on the page.
        """
        content = self.selector.xpath('//div[@class="markdown_views"]')
        # string(.) concatenates the text of all descendant nodes.
        return content[0].xpath('string(.)').strip()

    # Download the images.
    def getImg(self):
        """Download every image in the article body and re-host it.

        Each image is saved under ``./upload`` and pushed to the image host
        through ``upload``.

        Returns:
            A list of ``<img>`` tag strings pointing at the re-hosted
            images; empty when the article has no images or none could be
            uploaded.
        """
        imgs = self.selector.xpath('//div[@class="markdown_views"]//img/@src')
        if not imgs:
            return []
        # print(imgs)
        list_imgs = []
        # Create the directory the files are saved in.
        upload = os.path.join(os.getcwd(), "upload")
        os.makedirs(upload, exist_ok=True)
        # Download and save each image.
        for img_url in imgs:
            response = requests.get(img_url,headers=self.headers)
            if not response.ok:
                # Do not save and upload an error page as if it were an image.
                print('download failed..', img_url)
                continue
            nowTime = datetime.now().strftime("%Y%m%d%H%M%S")  # current timestamp
            randomNum = random.randint(0, 100)  # random n with 0 <= n <= 100
            # Zero-pad to a fixed three digits so names have a uniform length.
            file_name = '{}{:03d}.jpg'.format(nowTime, randomNum)
            save_name = os.path.join(upload, file_name)
            print('download..',save_name)
            with open(save_name,'wb') as f:
                f.write(response.content)
            # Upload the image.
            remote_pic = self.upload(save_name)
            if remote_pic:
                img_src = '<img class ="alignnone size-medium" src="'+remote_pic+'"  />'
                list_imgs.append(img_src)
        return list_imgs


    # Upload an image to the image host and return its address.
    def upload(self,save_name):
        """Upload a local image file to sm.ms.

        Args:
            save_name: Path of the local image file.

        Returns:
            The hosted image URL, or ``None`` when the response is not valid
            JSON or its ``code`` is not ``'success'``.
        """
        url = 'https://sm.ms/api/upload'

        data = {'ssl': False, 'format': 'json'}

        # The context manager closes the file even when the request fails.
        with open(save_name, 'rb') as image_file:
            response = requests.post(url, files={'smfile': image_file}, data=data)

        # Example of a successful response:
        # {'code': 'success', 'data': {'path': '/2018/04/19/5ad7fd2f7e60c.jpg', 'hash': 'vpw5S3armgducWz',
        #                              'url': 'https://i.loli.net/2018/04/19/5ad7fd2f7e60c.jpg', 'size': 215024,
        #                              'filename': '20160213173754690.jpg', 'storename': '5ad7fd2f7e60c.jpg',
        #                              'width': 1366, 'ip': '124.207.180.37', 'timestamp': 1524104495, 'height': 688,
        #                              'delete': 'https://sm.ms/delete/vpw5S3armgducWz'}}
        try:
            result = json.loads(response.text)
        except ValueError:
            # Non-JSON reply (e.g. an HTML error page): treat as a failed upload.
            print(response.text)
            return None
        print(result)

        if result.get('code') == 'success':
            return result['data']['url']
        return None
复制代码

阅读原文请访问

www.zxb8.cc