First, be clear about what you want to crawl; only with a well-defined goal can the later steps be targeted. Next, analyze the target website, including its URL structure, the HTML pages, and the network requests and responses, in order to find where the target data lives on the site and how it can be obtained, and to derive a crawl strategy from that. Finally comes the coding: send the network requests, parse the pages, extract the target data, and store it.
Analyzing the target website
- URL structure
This address does not actually contain the news list; inspecting the elements reveals the real news-list address.
Analyzing the page's HTML structure
- News detail page
The title can be found in the element with id="lblNewTitle" and the body in the element with id="lblNewContent".
There are many ways to parse HTML; here we use BeautifulSoup to pull out the information we need.
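As a quick illustration, here is a minimal sketch of extracting those two fields with BeautifulSoup, assuming `html` already holds the source of a detail page (the element ids are the ones identified above):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')                   # html: source of one news detail page
title = soup.find(id='lblNewTitle').get_text()       # news title
content = soup.find(id='lblNewContent').get_text()   # news body text
print(title)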
- News list page
Here we can find how many news items there are per page, the URL of each item, and how many pages there are in total.
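As a rough sketch, extracting those per-item links looks like this, assuming `html` holds one page of the list (how each page is actually fetched is covered next); the `div class="xinwen"` container and `li > a` layout are what the page uses:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')                       # html: one page of the news list
for li in soup.find('div', class_='xinwen').find_all('li'):
    href = li.find('a')['href']                          # relative link to one news item
    print("http://www.cnce7.com/index/" + href)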
When paging through the list, we noticed that clicking "next page" does not change the URL.
Looking at the page source shows that paging simply sends a POST request to fetch the data:
<body>
    <form name="form1" method="post" action="./TradeNews.aspx" id="form1">
        <div>
            <script type="text/javascript">
                var theForm = document.forms['form1'];
                if (!theForm) {
                    theForm = document.form1;
                }
                function __doPostBack(eventTarget, eventArgument) {
                    if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
                        theForm.__EVENTTARGET.value = eventTarget;
                        theForm.__EVENTARGUMENT.value = eventArgument;
                        theForm.submit();
                    }
                }
            </script>
We can see that each time we page, there is indeed a request to TradeNews.aspx, which matches the form action above.
Opening that request shows its request headers:
Request URL: www.cnce7.com/index/Trade…
Request Method: POST
Form Data
We can see that __EVENTARGUMENT corresponds exactly to the page number:
__VIEWSTATE: a long, opaque string
__VIEWSTATEGENERATOR: 7053E3F7
__EVENTTARGET: objPageControl
__EVENTARGUMENT: 2
__VIEWSTATEENCRYPTED:
At this point we can build the POST request ourselves:
url = 'http://www.cnce7.com/index/TradeNews.aspx'
formdata = {'type': 'index',
            '__VIEWSTATEGENERATOR': '7053E3F7',
            '__EVENTTARGET': 'objPageControl',
            '__EVENTARGUMENT': 2,
            '__VIEWSTATEENCRYPTED': ''}
response = requests.post(url, data=formdata)  # returns a Response object
Since I had no idea what that long __VIEWSTATE string was, I didn't send it at first, only to find that every request came back with the data for page 1. That stumped this beginner for a while, and I almost switched to the second approach of simulating the button clicks with Selenium.
But this value can be obtained by parsing the page itself; looking at the page source, it turns out to sit in a hidden input.
Parsing code:
html = response.text
soup = BeautifulSoup(html, 'lxml')
status = soup.find('input', id='__VIEWSTATE')
Adding this field to the POST data did the trick.
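Putting the two pieces together, here is a minimal sketch of a paging request that actually works: GET the page once to read the hidden __VIEWSTATE, then POST it back along with the page number.

import requests
from bs4 import BeautifulSoup

url = 'http://www.cnce7.com/index/TradeNews.aspx'

# step 1: fetch the page once and read the hidden __VIEWSTATE field
response = requests.get(url, timeout=(3, 7))
response.encoding = response.apparent_encoding
viewstate = BeautifulSoup(response.text, 'lxml').find('input', id='__VIEWSTATE')['value']

# step 2: post it back together with the page number we want
formdata = {'type': 'index',
            '__VIEWSTATE': viewstate,
            '__VIEWSTATEGENERATOR': '7053E3F7',
            '__EVENTTARGET': 'objPageControl',
            '__EVENTARGUMENT': 2,               # request page 2
            '__VIEWSTATEENCRYPTED': ''}
response = requests.post(url, data=formdata)
response.encoding = response.apparent_encoding
print(len(response.text))                       # this time the response really is page 2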
Defining the crawl strategy
By analyzing the URL structure and the HTML structure of the target site, we have finished the preparatory work. Next we need to define a crawl strategy that fits the site's characteristics and implement it in code:
- Walk through every page and save the link of each news item into a list
- Visit each news link in turn, parse it, and save the result to the database
Implementation
- Import the required libraries
import datetime
import requests
from bs4 import BeautifulSoup
from pymysql_comm import UsingMysql
Here, the requests library is used to send network requests and receive the data returned by the server; BeautifulSoup is used to parse the HTML and is a very simple, pleasant library to work with; pymysql writes the data to the database (for how to use pymysql, see the article referenced earlier).
- Visit the URL and collect all news links
def get_news_list(url, page_num, newsUrlList):
    # fetch the __VIEWSTATE needed for paging
    try:
        response = requests.get(url, timeout=(3, 7))  # returns a Response object
        # print(response.status_code)  # only 200 ever seems to show here; error codes never print -- to be investigated
        response.raise_for_status()  # raise HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding  # make sure the page is decoded correctly
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        status = soup.find('input', id='__VIEWSTATE')  # hidden field holding the view state
        # print(status['value'])
    except requests.exceptions.ConnectTimeout:
        print('Request timed out!')
        return
    except requests.exceptions.ConnectionError:
        print('Invalid address!')
        return
    # request each page in turn by posting the page number back
    for i in range(0, page_num + 1):
        formdata = {'type': 'index',
                    '__VIEWSTATE': status['value'],
                    '__VIEWSTATEGENERATOR': '7053E3F7',
                    '__EVENTTARGET': 'objPageControl',
                    '__EVENTARGUMENT': page_num - i,
                    '__VIEWSTATEENCRYPTED': ''}
        try:
            response = requests.post(url, data=formdata)  # returns a Response object
            response.raise_for_status()  # raise HTTPError if the status code is not 200
            response.encoding = response.apparent_encoding  # make sure the page is decoded correctly
            html = response.text
            soup = BeautifulSoup(html, 'lxml')
            # print(html)
            allNews = soup.find('div', class_='xinwen').find_all('li')
            for li in allNews:
                newsId = li.find('a')['href']  # relative link of one news item
                print(newsId)
                newsUrlList.append("http://www.cnce7.com/index/" + newsId)
                # save_news("http://www.cnce7.com/index/" + newsId)
        except requests.exceptions.ConnectTimeout:
            print('Request timed out!')
        except requests.exceptions.ConnectionError:
            print('Invalid address!')
- Fetch the content of one news item
def get_news_details(url, newsInfo):
    try:
        response = requests.get(url, timeout=(3, 7))  # returns a Response object
        response.raise_for_status()  # raise HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding  # make sure the page is decoded correctly
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        newsInfo['title'] = soup.find_all(id="lblNewTitle")[0].get_text()
        newsInfo['source'] = soup.find_all(id="lblCreateByName")[0].get_text()
        newsInfo['author'] = ""
        newsInfo['pubdate'] = soup.find_all(id="lblCreateTime")[0].get_text()
        newsInfo['content'] = soup.find_all(id="lblNewContent")[0]  # keep the whole tag; str() preserves the HTML
    except requests.exceptions.ConnectTimeout:
        print('Request timed out!')
    except requests.exceptions.ConnectionError:
        print('Invalid address!')
    print("Fetched news:", newsInfo['title'])
    print(">>>>>>>>>>>>>>>>>>>>>>", datetime.datetime.now())
- Save to the database
def save_news(urlList):
    with UsingMysql(log_time=True) as um:
        sql = "insert into tab_news(title,source,ndate,author,content) values(%s,%s,%s,%s,%s)"
        for url in urlList:
            news = {'title': '', 'source': '', 'pubdate': '', 'content': '', 'author': ''}
            get_news_details(url, news)
            params = (
                str(news['title']), str(news['source']), str(news['pubdate']), str(news['author']), str(news['content']))
            um.cursor.execute(sql, params)
            print("Saved >>>>>>>>>>>>>>>>>>", datetime.datetime.now())
This uses a small utility class I wrapped myself:
import pymysql
from timeit import default_timer

host = 'localhost'
port = 3306
db = 'cc7_ru'
user = 'root'
password = 'root'


# ---- operate the database with pymysql
def get_connection():
    print("Connecting to the database")
    conn = pymysql.connect(host=host, port=port, db=db, user=user, password=password)
    return conn


# ---- use `with` to tidy up the connection handling
class UsingMysql(object):

    def __init__(self, commit=True, log_time=True, log_label='total time'):
        """
        :param commit: whether to commit the transaction at the end (set to False for unit tests)
        :param log_time: whether to print the total running time
        :param log_label: custom label for the log output
        """
        self._log_time = log_time
        self._commit = commit
        self._log_label = log_label

    def __enter__(self):
        # start the timer if requested
        if self._log_time is True:
            self._start = default_timer()
        # acquire the connection and cursor automatically on entry
        conn = get_connection()
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        conn.autocommit(False)  # autocommit is a method in pymysql, not an attribute
        self._conn = conn
        self._cursor = cursor
        return self

    def __exit__(self, *exc_info):
        # commit the transaction
        if self._commit:
            self._conn.commit()
        # close the cursor and connection automatically on exit
        self._cursor.close()
        self._conn.close()
        if self._log_time is True:
            diff = default_timer() - self._start
            print('-- %s: %.6f s' % (self._log_label, diff))

    @property
    def cursor(self):
        return self._cursor
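For reference, using the wrapper looks like the sketch below (assuming the tab_news table from the insert statements already exists): entering the `with` block opens the connection and cursor, and leaving it commits, closes both, and prints the elapsed time.

with UsingMysql(log_time=True) as um:
    # DictCursor returns each row as a dict keyed by column name
    um.cursor.execute("select count(*) as total from tab_news")
    print(um.cursor.fetchone())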
Complete code
import datetime
import requests
from bs4 import BeautifulSoup
from pymysql_comm import UsingMysql
def select_one(cursor):
    cursor.execute("select * from tab_news")
    data = cursor.fetchone()
    print("-- one record: {0} ".format(data))
# insert a single record
def create_one(title, source, pubdate, author, content):
    with UsingMysql(log_time=True) as um:
        # print(um.cursor)
        sql = "insert into tab_news(title,source,ndate,author,content) values(%s,%s,%s,%s,%s)"
        params = (str(title), str(source), str(pubdate), str(author), str(content))
        um.cursor.execute(sql, params)
        # check the result
        # select_one(um.cursor)
def save_news(urlList):
    with UsingMysql(log_time=True) as um:
        sql = "insert into tab_news(title,source,ndate,author,content) values(%s,%s,%s,%s,%s)"
        for url in urlList:
            news = {'title': '', 'source': '', 'pubdate': '', 'content': '', 'author': ''}
            get_news_details(url, news)
            params = (
                str(news['title']), str(news['source']), str(news['pubdate']), str(news['author']), str(news['content']))
            um.cursor.execute(sql, params)
            print("Saved >>>>>>>>>>>>>>>>>>", datetime.datetime.now())
# crawl a single news item
def get_news_details(url, newsInfo):
    try:
        response = requests.get(url, timeout=(3, 7))  # returns a Response object
        response.raise_for_status()  # raise HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding  # make sure the page is decoded correctly
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        newsInfo['title'] = soup.find_all(id="lblNewTitle")[0].get_text()
        newsInfo['source'] = soup.find_all(id="lblCreateByName")[0].get_text()
        newsInfo['author'] = ""
        newsInfo['pubdate'] = soup.find_all(id="lblCreateTime")[0].get_text()
        newsInfo['content'] = soup.find_all(id="lblNewContent")[0]  # keep the whole tag; str() preserves the HTML
    except requests.exceptions.ConnectTimeout:
        print('Request timed out!')
    except requests.exceptions.ConnectionError:
        print('Invalid address!')
    print("Fetched news:", newsInfo['title'])
    print(">>>>>>>>>>>>>>>>>>>>>>", datetime.datetime.now())
def get_news_list(url, page_num, newsUrlList):
    # fetch the __VIEWSTATE needed for paging
    try:
        response = requests.get(url, timeout=(3, 7))  # returns a Response object
        # print(response.status_code)  # only 200 ever seems to show here; error codes never print -- to be investigated
        response.raise_for_status()  # raise HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding  # make sure the page is decoded correctly
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        status = soup.find('input', id='__VIEWSTATE')  # hidden field holding the view state
        # print(status['value'])
    except requests.exceptions.ConnectTimeout:
        print('Request timed out!')
        return
    except requests.exceptions.ConnectionError:
        print('Invalid address!')
        return
    # request each page in turn by posting the page number back
    for i in range(0, page_num + 1):
        formdata = {'type': 'index',
                    '__VIEWSTATE': status['value'],
                    '__VIEWSTATEGENERATOR': '7053E3F7',
                    '__EVENTTARGET': 'objPageControl',
                    '__EVENTARGUMENT': page_num - i,
                    '__VIEWSTATEENCRYPTED': ''}
        try:
            response = requests.post(url, data=formdata)  # returns a Response object
            response.raise_for_status()  # raise HTTPError if the status code is not 200
            response.encoding = response.apparent_encoding  # make sure the page is decoded correctly
            html = response.text
            soup = BeautifulSoup(html, 'lxml')
            # print(html)
            allNews = soup.find('div', class_='xinwen').find_all('li')
            for li in allNews:
                newsId = li.find('a')['href']  # relative link of one news item
                print(newsId)
                newsUrlList.append("http://www.cnce7.com/index/" + newsId)
                # save_news("http://www.cnce7.com/index/" + newsId)
        except requests.exceptions.ConnectTimeout:
            print('Request timed out!')
        except requests.exceptions.ConnectionError:
            print('Invalid address!')
if __name__ == '__main__':
    url = 'http://www.cnce7.com/index/TradeNews.aspx'
    page = 40
    newsUrlList = []
    # collect the news URLs from `page` pages of the list into newsUrlList
    get_news_list(url, page, newsUrlList)
    save_news(newsUrlList)