本文已参与「新人创作礼」活动,一起开启掘金创作之路。
前言
因为课程课题需要,需爬取采购网的数据。不得不一夜速成python,终于从几十个代码中找到了个能用的,在使用过程中遇到了很多bug,写一下详细的使用方法,需要自取
这个爬出来的结果是往数据库中存的,我用的是phpstudy的数据库工具sql_Front,这个可以导出excel文件方便统计
建议看一下后面的注意事项,大佬就不用看了,写给我这样小白看的
这个还有很多不足,有大佬帮助完善下就感激不尽了
目标网站
http://search.ccgp.gov.cn/bxsearch
脚本内容
mysql.py
import pymysql
class MySQL():
    """Thin wrapper around a pymysql connection for inserting dict rows.

    Connection parameters are hard-coded below; edit them to match your
    local MySQL setup (the target database must already exist).
    """

    def __init__(self):
        try:
            self.db = pymysql.connect(host='127.0.0.1', user='123', password='123456',
                                      database='crwal_gov', charset='utf8', port=3306)
            self.cursor = self.db.cursor()
        except pymysql.MySQLError as e:
            # On failure the error is printed and self.db / self.cursor stay
            # unset; a later insert() will then raise AttributeError.
            print(e.args)

    def insert(self, table, data):
        """Insert ``data`` (a dict) as one row into ``table``.

        Dict keys become column names, dict values the row values.
        Rolls back and prints the error on any MySQL failure.
        """
        keys = ','.join(data.keys())
        placeholders = ','.join(['%s'] * len(data))
        # NOTE(security): table and column names cannot be bound as query
        # parameters, so they are interpolated directly into the SQL text.
        # They must come from trusted input only; the row *values* are
        # passed safely as bound parameters.
        sql_query = 'insert into %s (%s) values (%s)' % (table, keys, placeholders)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
        except pymysql.MySQLError as e:
            print(e.args)
            self.db.rollback()
zhengfucaigouspider.py
#
# _ooOoo_
# o8888888o
# 88" . "88
# (| -_- |)
# O\ = /O
# ____/`---'\____
# .' \\| |// `.
# / \\||| : |||// \
# / _||||| -:- |||||- \
# | | \\\ - /// | |
# | \_| ''\---/'' | |
# \ .-\__ `-` ___/-. /
# ___`. .' /--.--\ `. . __
# ."" '< `.___\_<|>_/___.' >'"".
# | | : `- \`.;`\ _ /`;.`/ - ` : | |
# \ \ `-. \_ __\ /__ _/ .-` / /
# ======`-.____`-.___\_____/___.-`____.-'======
# `=---='
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Buddha Bless, No Bug !
#
#coding:utf-8
import datetime
import json
import re
import threading
import time
import requests
from lxml import etree
from mysql import MySQL
class ZhenfucaigouSpider():
    """Spider for Chinese government procurement announcements (ccgp.gov.cn).

    Searches the bxsearch endpoint for ``keyword`` within the configured
    date range, follows every result link, parses the announcement detail
    table and stores each record into the ``zhenfucaigou`` MySQL table.
    """

    # Region filter (displayZone / zoneId) is passed directly in the URL
    # because it was not accepted as a query parameter.
    url = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&displayZone=广东&zoneId=44+not+4403'
    keyword = '医院'
    start_time = '2020:03:28'   # the site rejects ranges longer than one year
    end_time = '2021:03:28'
    page_num = 1
    params = {
        'searchtype': '1',
        'page_index': page_num,
        'bidSort': '0',
        'pinMu': '0',
        'bidType': '0',
        'kw': keyword,
        'start_time': start_time,
        'end_time': end_time,
        'timeType': '6'
    }
    headers = {
        'Cookie': 'JSESSIONID=EgPd86-6id_etA2QDV31Kks3FrNs-4gwHMoSmEZvnEktWIakHbV3!354619916; Hm_lvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1545618390; Hm_lpvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1545618390; td_cookie=2144571454; Hm_lvt_9459d8c503dd3c37b526898ff5aacadd=1545611064,1545618402,1545618414; Hm_lpvt_9459d8c503dd3c37b526898ff5aacadd=1545618495',
        'Host': 'search.ccgp.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36'
    }
    # NOTE: created at class-definition time, shared by all instances.
    mysql = MySQL()

    def get_page(self, url, headers, params):
        """Fetch one search-result page; return decoded HTML or None."""
        try:
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                # Drop the copyright sign, which breaks downstream handling.
                return response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
            print(response.status_code)
        except requests.ConnectionError:
            return None

    def get_detail_page(self, url):
        """Fetch one announcement detail page; return decoded HTML or None."""
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
        except requests.ConnectionError:
            return None

    def get_all_url(self, html):
        """Extract every href="...htm" detail link from a result-list page."""
        pattern1 = '<.*?(href=".*?htm").*?'
        href_url = re.findall(pattern1, html, re.I)
        return [match.replace('href=', '').replace('"', '') for match in href_url]

    def parse_datail_page(self, html):
        """Parse the detail-page summary table into a {column: value} dict.

        ``html`` is an lxml element tree.  Row titles become dict keys
        (Chinese column names, matching the DB schema); a few compound
        rows are split into two key/value pairs.
        """
        table_list = html.xpath('//div[@class="table"]//tr')
        all_info = {}
        for table in table_list:
            if len(table.xpath('td[@class="title"]/text()')) > 0:
                title = ''.join(table.xpath('td[@class="title"]/text()'))
                value = ''.join(table.xpath('td[@colspan="3"]/text()'))
                # Attachment rows: store a direct download URL built from
                # the anchor's id attribute instead of the link text.
                if (title.find('附件') == 0):
                    value = 'http://www.ccgp.gov.cn/oss/download?uuid=' + ''.join(table.xpath('td[@colspan="3"]/a/@id'))
                # "公告时间" rows carry both the district and the date in
                # two fixed-width cells; split them into two columns.
                if ('公告时间' in title):
                    title = '公告时间'
                    value = table.xpath('td[@width="168"]/text()')[1]
                    district_key = '行政区域'
                    district_value = (table.xpath('td[@width="168"]/text()'))[0]
                    all_info[district_key] = district_value
                # Combined announcement/award-date rows → two columns.
                if '本项目招标公告日期中标日期' in title:
                    title = '本项目招标公告日期'
                    value = table.xpath('td[@width="168"]/text()')[0]
                    zhongbiaoriqi_key = '中标日期'
                    zhongbiaoriqi_value = table.xpath('td[@width="168"]/text()')[1]
                    all_info[zhongbiaoriqi_key] = zhongbiaoriqi_value
                if '本项目招标公告日期成交日期' in title:
                    title = '本项目招标公告日期'
                    value = table.xpath('td[@width="168"]/text()')[0]
                    zhongbiaoriqi_key = '中标日期'
                    # The two dates are concatenated; the award date starts
                    # after the 11-character announcement date prefix.
                    zhongbiaoriqi_value = ''.join(table.xpath('td[@width="168"]/text()'))[11:]
                    all_info[zhongbiaoriqi_key] = zhongbiaoriqi_value
                all_info[title] = value
        all_info['插入时间'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return all_info

    def start(self, url):
        """Fetch, parse and store one announcement; used as a thread target."""
        time.sleep(3)  # throttle so we do not hammer the server
        html = self.get_detail_page(url)
        if html is None:
            # FIX: previously a failed fetch passed None to etree.HTML
            # and crashed the worker thread.
            return
        html = etree.HTML(html)
        all_info = self.parse_datail_page(html)
        print(all_info)
        self.mysql.insert('zhenfucaigou', all_info)

    def run(self):
        """Walk result pages 1..199 and scrape every linked announcement."""
        for i in range(1, 200):
            print('正在爬取第{}页'.format(str(i)))
            self.params['page_index'] = i
            html = self.get_page(url=self.url, headers=self.headers, params=self.params)
            if not html:
                # FIX: get_page returns None on connection errors or a
                # non-200 status; re.findall(None) used to raise TypeError.
                continue
            url_list = self.get_all_url(html)
            # FIX: the original wrote Thread(target=self.start(url), args=url),
            # which *called* start() immediately, handed Thread its None
            # return value as target, and split the URL string into
            # per-character args — so nothing actually ran concurrently.
            threads = [threading.Thread(target=self.start, args=(url,)) for url in url_list]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
if __name__ == '__main__':
    # Entry point: build the spider and crawl all result pages.
    spider = ZhenfucaigouSpider()
    spider.run()
注意事项
mysql.py
配置好数据库信息,两个py放在同一个目录下
提前创建好crwal_gov数据库(crwal_gov可以根据自己需求进行修改)
zhengfucaigouspider.py
keyword = 关键词
指定时间 :start_time = 起始时间,end_time = 结束时间。时间跨度不允许超过一年,否则会没数据
displayZone = 地区。不过好像还有个zoneId = 编号,这样操作是无效的就注释掉了。
因为是get传参,我就直接加在url里面了,要是改的话把url中的广东改成其他省份,把44+not+4403改为对应省份的代码(参考下面zoneid的值)
有个问题是数据库,要提前创建好字段,脚本不会自动创建字段……(对不起我还没学会)
因为公布概要的内容都大差不离的,需要的字段也就那么几(十)个
可以参考我的字段,除了这些还有一个自增id,同时也要注意py脚本运行的提示(会提示没有哪些字段,及时补充,不然数据插入不进去)
还有个问题,不会自动终止,循环爬最后一页的数据……
解决办法就是,提前搜索看好一共有几页,及时制止代码。假设n页,提示开始爬n+1页的时候就终止。
我遇到的问题大概就这么多,再有报错就百度解决吧,应该问题都不大了。
抓取效果
来看一下我的效果
python运行界面:
数据库:
导出的excel: