Python Data Analysis: Web Scraping


Notes on scraping data

1. Understanding the structure of a web page

A web page is built from three layers: HTML, CSS, and JavaScript.

1. HTML
<html>..</html>  the page
<body>..</body>  content visible to the user
<div>..</div>    a block-level container (layout)
<p>..</p>        a paragraph
<li>..</li>      a list item
<img>            an image (a void element, no closing tag)
<h1>..</h1>      a heading
<a href=''>..</a> a hyperlink

2. CSS
Defines the presentation (styling) of the page.

3. JavaScript
Implements behavior: interactivity and visual effects.

2. Packages to install

HTTP request tool: requests
Parsers: lxml / BeautifulSoup (provided by the bs4 package) / pyquery
Storage: pymysql / pymongo / redis
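
All of these can be installed with pip (a minimal sketch; the names below are the PyPI package names, adjust to your environment):
pip install requests lxml beautifulsoup4 pyquery pymysql pymongo redis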

3. Sending requests to a website

Step 1: fetch the page source
import requests

url = 'http://www.cntour.cn/'
response = requests.get(url)               # GET request: fetch the page source
# response = requests.post(url, data=dic)  # POST request: build the request headers; dic can be a dict or a JSON string

d = response.json()                        # decode a JSON response body into a dict
response.headers                           # response headers
response.cookies['example_cookie_name']    # read a cookie from the response

Note: to cope with anti-scraping measures, set a User-Agent header, add a delay between requests, or build a proxy pool.
# set a User-Agent header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
response = requests.get(url, headers=headers)

# pause 3 seconds between requests
import time
time.sleep(3)

# route requests through your own proxy pool
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
response = requests.get(url, proxies=proxies)
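
Putting these pieces together, one way to make the fetch step more robust is a small retrying helper (a minimal sketch; fetch_html, the retry count, and the timeout are illustrative and not from the original):

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/70.0.3538.110 Safari/537.36'}

def fetch_html(url, retries=3, delay=3):
    # hypothetical helper: GET a page with a User-Agent header,
    # waiting `delay` seconds between failed attempts
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()      # raise on 4xx/5xx status codes
            return resp.text
        except requests.RequestException as e:
            print('request failed:', e)
            time.sleep(delay)
    return None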


Step 2: parse the page with BeautifulSoup (install the bs4 and lxml packages)
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'lxml')  # parse with lxml (html5lib and html.parser are also supported); the page becomes a tree whose nodes are Python objects

data = soup.select('#main>div>div.mtop.firstMod.clearfix>div.centerBox>ul.newsList>li>a')  # in the browser dev tools, right-click the target element, choose Copy selector, and pass the selector to select()
for item in data:
    result = {
        'title': item.get_text(),
        'link': item.get('href')
    }
    print(result)  # the extracted target: title and link
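
To try the extraction without hitting a live site, the same idea works on an inline HTML string (a self-contained sketch; the sample markup is made up):

from bs4 import BeautifulSoup

sample = """
<ul class="newsList">
  <li><a href="/news/1.html">First headline</a></li>
  <li><a href="/news/2.html">Second headline</a></li>
</ul>
"""
soup = BeautifulSoup(sample, 'html.parser')
for a in soup.select('ul.newsList > li > a'):
    print({'title': a.get_text(), 'link': a.get('href')})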

The case above is one where the page source is easy to analyze directly. When the source is more complex, you have to study the page structure first and then use BeautifulSoup, for example to pull JSON data embedded in <script> tags:
import json
import datetime

content = response.content.decode('utf-8')  # decode the response bytes into a str
soup = BeautifulSoup(content, 'html.parser')
listA = soup.find_all(name='script', attrs={'id': 'getAreaStat'})
listB = soup.find_all(name='script', attrs={'id': 'getListByCountryTypeService2true'})
account = str(listA)
world_messages = str(listB)[95:-21]  # strip the surrounding <script> wrapper, keep only the JSON text
messages = account[52:-21]
print(messages)
messages_json = json.loads(messages)
print(world_messages)
world_messages_json = json.loads(world_messages)  # parse the world-level JSON
valuesList = []
cityList = []
worldList = []
now_time = datetime.datetime.now().strftime('%Y-%m-%d')

for k in range(len(world_messages_json)):
    worldvalue = (now_time,
                  world_messages_json[k].get('countryType'),
                  world_messages_json[k].get('continents'),
                  world_messages_json[k].get('provenceId'),
                  world_messages_json[k].get('provinceName'),
                  world_messages_json[k].get('provinceShortName'),
                  world_messages_json[k].get('cityName'),
                  world_messages_json[k].get('currentConfirmedCount'),
                  world_messages_json[k].get('confirmedCount'),
                  world_messages_json[k].get('suspectedCount'),
                  world_messages_json[k].get('curedCount'),
                  world_messages_json[k].get('deadCount'),
                  world_messages_json[k].get('locationId'),
                  world_messages_json[k].get('countryShortCode'),)
    worldList.append(worldvalue)
print(worldList)
for i in range(len(messages_json)):
    value = (now_time,
             messages_json[i].get('provinceName'),
             messages_json[i].get('provinceShortName'),
             messages_json[i].get('currentConfirmedCount'),
             messages_json[i].get('confirmedCount'),
             messages_json[i].get('suspectedCount'),
             messages_json[i].get('curedCount'),
             messages_json[i].get('deadCount'),
             messages_json[i].get('comment'),
             messages_json[i].get('locationId'),
             messages_json[i].get('statisticsData'),)
    valuesList.append(value)
    cityValue = messages_json[i].get('cities')  # list of city records for this province
    for j in range(len(cityValue)):
        cityValueList = (
            cityValue[j].get('cityName'), cityValue[j].get('currentConfirmedCount'),
            cityValue[j].get('confirmedCount'), cityValue[j].get('suspectedCount'),
            cityValue[j].get('curedCount'), cityValue[j].get('deadCount'), cityValue[j].get('locationId'),
            messages_json[i].get('provinceShortName'))
        cityList.append(cityValueList)
print(valuesList)
print(cityList)
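
The lists then need to be written to a database. A minimal sketch of how the connection and the world-level INSERT might be set up with pymysql follows; the connection settings, database name, table name, and column list are assumptions for illustration only:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='***',
                     database='epidemic', charset='utf8mb4')  # hypothetical connection settings
cursor = db.cursor()

# hypothetical table and columns; 14 placeholders matching the worldvalue tuple above
sql_world = ('INSERT INTO world_data '
             '(dt, countryType, continents, provinceId, provinceName, provinceShortName, '
             'cityName, currentConfirmedCount, confirmedCount, suspectedCount, '
             'curedCount, deadCount, locationId, countryShortCode) '
             'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
worldtuple = worldList  # executemany accepts a list of row tuples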

try:
    db.commit()
except Exception:
    print('insert failed, rolling back (1)')
    db.rollback()

try:
    cursor.executemany(sql_world, worldtuple)  # bulk-insert the world-level rows
    db.commit()
except Exception:
    print('insert failed, rolling back (2)')
    db.rollback()

try:
    db.commit()
except Exception:
    print('insert failed, rolling back (3)')
    db.rollback()

try:
    db.commit()
except Exception:
    print('insert failed, rolling back (4)')
    db.rollback()

db.close()