#!/usr/bin/env python
# coding=utf8
'''python 爬取csdn 文章到 wordpress '''
import requests
import re
import json
import time
from bs4 import BeautifulSoup
from lxml import etree
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import NewPost
from csdn import CSDN
import pymysql
class WordPress:
    """Publish crawled CSDN articles to a WordPress site.

    Two publishing paths are provided: ``sends`` goes through the WordPress
    XML-RPC API, while ``query`` writes rows directly into the WordPress
    MySQL tables.
    """

    def __init__(self):
        """Create the XML-RPC client and the MySQL connection/cursor."""
        # NOTE(review): endpoints and credentials are hard-coded placeholders;
        # consider loading them from configuration.
        self.wp = Client('http://blog.zxb8.cc/xmlrpc.php', 'username', 'password')
        self.conn = pymysql.connect(host='104.224.151.80', port=3306, user='xxx', passwd='xx', db='blog', charset='utf8')
        self.cursor = self.conn.cursor()

    def sends(self, title, content):
        """Publish one article through the XML-RPC API.

        Args:
            title: Post title.
            content: Post body.
        """
        post = WordPressPost()
        post.title = title
        post.content = content
        post.post_status = 'publish'
        self.wp.call(NewPost(post))
        # Pause between posts so consecutive calls are spaced out.
        time.sleep(3)
        print('发布成功')

    def create(self, url):
        """Crawl the CSDN article at ``url`` and import it into the database.

        Args:
            url: Address of the CSDN article page.
        """
        print(url)
        csdn = CSDN(url)
        title = csdn.getTitle()
        content = csdn.getContent()
        img = csdn.getImg()
        print(img)
        # Truthiness check also covers a ``None`` result, not just an empty list.
        if img:
            content += " ".join(img)
        # Category 1 is used for every imported article.
        self.query(title, content, 1)
        time.sleep(3)
        print('发布成功')

    def query(self, title, content, cat):
        """Insert one post directly into the WordPress tables via pymysql.

        Writes a row to ``wp_posts``, sets its ``guid`` from the new row id
        and links it to a category in ``wp_term_relationships``. The three
        statements are committed together; on any failure the transaction is
        rolled back and the exception is re-raised.

        Args:
            title: Post title, also stored as ``post_name``.
            content: Post body text; it is wrapped in highlight markup.
            cat: ``term_taxonomy_id`` to attach the post to.
        """
        # Strip the source blog's copyright banner from the body.
        content = content.replace("【工匠若水 http://blog.csdn.net/yanbober 未经允许严禁转载,请尊重作者劳动成果。私信联系我】", "")
        times = time.strftime('%Y-%m-%d %H:%M:%S')
        # Wrap the body so the highlight plugin renders it as code.
        content = '<pre class ="pure-highlightjs"> <code class ="">' + content + '</code> </pre>'

        # Values are passed as query parameters so the driver escapes every
        # one of them (the title included), instead of interpolating strings.
        sql_post = (
            "INSERT INTO wp_posts(post_author,post_date,post_content,post_title,"
            "post_excerpt,post_status,comment_status,ping_status,post_name,to_ping,"
            "pinged,post_modified,post_content_filtered,post_parent,menu_order,"
            "post_type,comment_count) "
            "VALUES ('1',%s,%s,%s,'','publish','open','open',%s,'','',%s,'','0','0','post','0')"
        )
        update_sql = "UPDATE `wp_posts` SET `guid` = %s WHERE `ID` = %s"
        sql_cat = (
            "INSERT INTO wp_term_relationships(object_id,term_taxonomy_id,term_order) "
            "VALUES (%s,%s,'0')"
        )

        try:
            self.cursor.execute(
                sql_post,
                (str(times), str(content), str(title), str(title), str(times)),
            )
            new_id = self.cursor.lastrowid
            guid = "http://blog.zxb8.cc/?p={}".format(new_id)
            self.cursor.execute(update_sql, (guid, new_id))
            self.cursor.execute(sql_cat, (new_id, cat))
            self.conn.commit()
        except Exception:
            # Do not leave a half-written post behind.
            self.conn.rollback()
            raise
if __name__ == '__main__':
    # Crawl one CSDN category page and import every article it links to.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    publisher = WordPress()
    category_url = "https://blog.csdn.net/yanbober/article/category/6971209"
    listing_html = requests.get(category_url, headers=request_headers).text
    page_tree = etree.HTML(listing_html)
    # Alternative selector kept for reference: '//li[@class="blog-unit"]/a/@href'
    article_links = page_tree.xpath('//div[@class="article_title"]//a/@href')
    for article_url in article_links:
        print('正在努力爬取中...', article_url)
        publisher.create(article_url)
# i = 1
# while i<=5:
# url = "https://blog.csdn.net/mrlevo520/article/list/{}".format(i)
# i=i+1
# response = requests.get(list_url,headers=headers)
# selector = etree.HTML(response.text)
# for url in selector.xpath('//li[@class="blog-unit"]/a/@href'):
# print('正在努力爬取中...',url)
# wordpress.create(url)
#/usr/bin/env python
# -*- coding:utf-8 -*-
#https://blog.csdn.net/MrLevo520/article/details/53158050
import requests
import json
import os
from lxml import etree
import time
import random
from datetime import *
class CSDN():
    """Scraper for a single CSDN article page.

    The page is fetched and parsed once in the constructor; the ``get*``
    methods then read from the parsed tree.
    """

    def __init__(self, url):
        """Fetch and parse the article page.

        Args:
            url: Address of the CSDN article page.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.url = url
        self.selector = etree.HTML(self.getHtml())

    def getHtml(self):
        """Return the raw HTML of the article page."""
        # Send the same browser-like headers used for the image downloads.
        response = requests.get(self.url, headers=self.headers)
        return response.text

    def getTitle(self):
        """Return the article title with surrounding whitespace removed.

        Raises:
            IndexError: If the title element is not present on the page.
        """
        title = self.selector.xpath('//span[@class="link_title"]/a/text()')
        return title[0].strip()

    def getTag(self):
        """Return the text of each tag link as a list."""
        tags = self.selector.xpath('//div[@id="article_details"]/div[contains(@class,"article_manage")]//div[@class="article_l"]//a')
        taglist = []
        for tag in tags:
            texts = tag.xpath('./text()')
            # Skip anchors without a text node instead of raising IndexError.
            if texts:
                taglist.append(texts[0])
        return taglist

    def getReadNum(self):
        """Print the texts of the first three spans in the article stats area."""
        base = '//div[@id="article_details"]/div[contains(@class,"article_manage")]//div[@class="article_r"]'
        publish_time = self.selector.xpath(base + '/span[1]/text()')
        read = self.selector.xpath(base + '/span[2]/text()')
        comment = self.selector.xpath(base + '/span[3]/text()')
        print(publish_time)
        print(read)
        print(comment)

    def getContent(self):
        """Return the article body as plain text.

        Raises:
            IndexError: If the body container is not present on the page.
        """
        content = self.selector.xpath('//div[@class="markdown_views"]')
        # string(.) concatenates the text of all descendant nodes.
        return content[0].xpath('string(.)').strip()

    def _image_file_name(self, index):
        """Build a local file name for the ``index``-th image of this article."""
        # A microsecond timestamp plus the per-article index avoids the name
        # collisions a coarse timestamp plus a small random number allows.
        stamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
        return '{}{:03d}.jpg'.format(stamp, index)

    def getImg(self):
        """Download the article's images, re-upload them, return ``<img>`` tags.

        Each image is saved under ``./upload`` and then sent to the image
        host via ``upload``. Images that fail to download or upload are
        skipped.

        Returns:
            A list of ``<img ...>`` HTML strings; empty when the article has
            no images.
        """
        imgs = self.selector.xpath('//div[@class="markdown_views"]//img/@src')
        if not imgs:
            return []

        list_imgs = []
        upload = os.path.join(os.getcwd(), "upload")
        os.makedirs(upload, exist_ok=True)

        for index, img_url in enumerate(imgs):
            response = requests.get(img_url, headers=self.headers)
            if not response.ok:
                # Do not store or upload an error page as if it were an image.
                print('download failed..', img_url)
                continue
            save_name = os.path.join(upload, self._image_file_name(index))
            print('download..', save_name)
            with open(save_name, 'wb') as f:
                f.write(response.content)
            remote_pic = self.upload(save_name)
            if remote_pic:
                img_src = '<img class ="alignnone size-medium" src="' + remote_pic + '" />'
                list_imgs.append(img_src)
        return list_imgs

    def upload(self, save_name):
        """Upload a local image to the sm.ms image host.

        Args:
            save_name: Path of the local image file.

        Returns:
            The remote image URL on success, otherwise ``None``.
        """
        url = 'https://sm.ms/api/upload'
        data = {'ssl': False, 'format': 'json'}
        # The context manager closes the file handle once the request is done.
        with open(save_name, 'rb') as image_file:
            response = requests.post(url, files={'smfile': image_file}, data=data)
        # A successful response looks like
        # {'code': 'success', 'data': {'url': 'https://i.loli.net/...jpg', ...}}
        try:
            result = json.loads(response.text)
        except ValueError:
            # Not JSON (e.g. an error page): treat as a failed upload.
            print(response.text)
            return None
        print(result)
        if result.get('code') == 'success':
            return result['data']['url']
        return None
# "阅读原文请访问" (To read the original article, please visit ...) -- leftover page text, not code.