场景
爬取微博用户首页信息时,需要携带cookie访问,于是想了个笨方法来获取cookie,也就是通过java集成selenium来模拟浏览器请求,从而获取到cookie
步骤
1、linux安装浏览器,以chrome为例
下载方式很多,自行百度也可。我是去官网下载的安装包,根据linux操作系统选择相应的包,然后用下面的命令安装。如果报错,先执行一下 apt-get update 再重试;报错信息的最后一行会提示你该怎么做,注意看一下
sudo dpkg -i google-chrome-stable_current_amd64.deb
安装成功后,查看一下chrome的版本(例如执行 google-chrome --version),这个很关键
记住版本号,去下载与之对应版本的驱动包(chromedriver):https://chromedriver.storage.googleapis.com/index.html
解压驱动,把chromedriver放在/usr/bin目录下,并加上可执行权限(例如 chmod +x /usr/bin/chromedriver)
至此,linux准备工作完成了
2、maven依赖selenium包
<!-- Selenium Java bindings; supplies the WebDriver, ChromeDriver and ChromeOptions classes used by the Java snippet in step 3. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.4.0</version>
</dependency>
3、java调用
// Loads a Weibo profile page in headless Chrome and returns the rendered page
// source followed by the session cookies serialized as JSON.
//
// Optional: point Selenium at the chromedriver binary explicitly when it is
// not discoverable on the PATH.
//System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
ChromeOptions chromeOptions = new ChromeOptions();
// Headless mode is mandatory on a host without a GUI.
chromeOptions.addArguments("headless");
// Also required in the author's setup; presumably because Chrome refuses to
// start sandboxed when run as root - confirm for your environment.
chromeOptions.addArguments("no-sandbox");
WebDriver driver = new ChromeDriver(chromeOptions);
try {
    driver.get("https://weibo.com/mikuya1031?is_hot=1");
    // The page fills in its content (and cookies) with a delay, so wait before reading.
    Thread.sleep(5000);
    String result = driver.getPageSource();
    Set<Cookie> coo = driver.manage().getCookies();
    return result + "---------------------------------------------" + JSON.toJSONString(coo);
} finally {
    // quit() ends the whole WebDriver session and shuts down the driver process,
    // whereas close() only closes the current window. Running it in finally
    // guarantees the browser is released even when navigation or the wait throws.
    driver.quit();
}
4、效果
url内容:
执行结果:
<html><head>
<meta charset="utf-8">
<meta content="不2不叫周淑怡,不2不叫周淑怡的微博,微博,新浪微博,weibo" name="keywords">
<meta content="不2不叫周淑怡,英雄联盟官方解说 斗鱼直播平台签约主播 《我是唱作人》百人评审。不2不叫周淑怡的微博主页、个人资料、相册,上海七煌信息科技有限公司。新浪微博,随时随地分享身边的新鲜事儿。" name="description">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="renderer" content="webkit">
<meta name="viewport" content="initial-scale=1,minimum-scale=1">
<link rel="dns-prefetch" href="//img.t.sinajs.cn/">
<link rel="dns-prefetch" href="//img1.t.sinajs.cn/">
<link rel="dns-prefetch" href="//js.t.sinajs.cn/">
<link rel="dns-prefetch" href="//js1.t.sinajs.cn/">
<link rel="dns-prefetch" href="//js2.t.sinajs.cn/">
<link rel="dns-prefetch" href="//biz.weibo.com/">
<link rel="dns-prefetch" href="//beacon.sina.com.cn/">
<link rel="dns-prefetch" href="//rs.sinajs.cn/">
<link rel="dns-prefetch" href="//tp1.sinaimg.cn/">
<link rel="dns-prefetch" href="//tp2.sinaimg.cn/">
<link rel="dns-prefetch" href="//tp3.sinaimg.cn/">
<link rel="dns-prefetch" href="//tp4.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww1.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww2.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww3.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww4.sinaimg.cn/">
<link rel="mask-icon" sizes="any" href="//img.t.sinajs.cn/t6/style/images/apple/wbfont.svg" color="black">
<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico">
<title>不2不叫周淑怡的微博_微博</title>
<link type="text/css" rel="stylesheet" charset="utf-8" href="//img.t.sinajs.cn/t6/style/css/module/base/frame.css?version=8360e4a841c8aaec" putoff="style/css/module/combination/extra.css?version=8360e4a841c8aaec">
<link type="text/css" rel="stylesheet" charset="utf-8" href="//img.t.sinajs.cn/t6/style/css/module/combination/PCD_profile_home_A.css?version=8360e4a841c8aaec" includes="style/css/module/pagecard/PCD_counter.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_person_info.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_user_a.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_pictext_a.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_piclist_a.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_mydata.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_photolist.css?version=8360e4a841c8aaec|style/css/module/list/comb_WB_feed_profile.css?version=8360e4a841c8aaec|style/css/module/global/WB_timeline.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_profileme.css?version=8360e4a841c8aaec|style/css/module/tab/comb_WB_tab_profile.css?version=8360e4a841c8aaec|style/css/module/list/comb_webim.css?version=8360e4a841c8aaec">
<link type="text/css" rel="stylesheet" charset="utf-8" href="//img.t.sinajs.cn/t6/skin/diy/skin.css?version=8360e4a841c8aaec">
<script type="text/javascript" charset="utf-8" async="" src="https://js.t.sinajs.cn/open/analytics/js/suda.js?version=44f46a2945abfbb8"></script><script type="text/javascript">
~function(l){var h = l.href, r = l.protocol +' //'+ l.host, i = h.indexOf('#!'), s = i != -1 ? h.substr(i + 2) : '';if ( (new RegExp("/[a-zA-Z0-9\\.\\-~!@#$%^&*+?:_/=<>]+", "gi")).test(s) ) l.replace(r + s);}(location)
</script>
<script type="text/javascript">
try{document.execCommand("BackgroundImageCache",
false, true);
}catch(e){}
</script>
<!-- $CONFIG -->
<script type="text/javascript">
var $CONFIG = {};
$CONFIG['islogin'
]='2';
$CONFIG['oid'
]='1890196401';
$CONFIG['page_id'
]='1005051890196401';
$CONFIG['onick'
]='不2不叫周淑怡';
$CONFIG['skin'
]='diy';
$CONFIG['background'
]='70aa1bb1gy1fyazl2776tj20u00u0adf';
$CONFIG['scheme'
]='diy007';
$CONFIG['colors_type'
]='0';
$CONFIG['uid'
]='3655689037';
$CONFIG['nick'
]='欢迎新用户';
$CONFIG['sex'
]='m';
$CONFIG['watermark'
]='u/3655689037';
$CONFIG['domain'
]='100505';
$CONFIG['lang'
]='zh-cn';
$CONFIG['avatar_large'
]='https: //tva1.sinaimg.cn/crop.0.0.179.179.180/d9e5634djw1east9pi6bej2050050dfw.jpg?KID=imgbed,tva&Expires=1606237518&ssig=iilDlEmK9X';
$CONFIG['timeDiff'
]=(new Date() - 1606226719000);
$CONFIG['servertime'
]='1606226719';
$CONFIG['location'
]='page_100505_home';
$CONFIG['pageid'
]='';
$CONFIG['title_value'
]='不2不叫周淑怡的微博_微博';
$CONFIG['$webim'
]='1';
$CONFIG['miyou'
]='1';
$CONFIG['brand'
]='0';
$CONFIG['bigpipe'
]='true';
$CONFIG['bpType'
]='page';
$CONFIG['cssPath'
]=' //img.t.sinajs.cn/t6/';
$CONFIG['imgPath'
]=' //img.t.sinajs.cn/t6/';
$CONFIG['jsPath'
]=' //js.t.sinajs.cn/t6/';
$CONFIG['mJsPath'
]=[
"//js{n}.t.sinajs.cn/t6/",
1,
2
];
$CONFIG['mCssPath'
]=[
"//img{n}.t.sinajs.cn/t6/",
1,
2
];
$CONFIG['version'
]='44f46a2945abfbb8';
$CONFIG['g_mathematician'
]='1';
$CONFIG['isAuto'
]='0';
$CONFIG['timeweibo'
]='1';
$CONFIG['isVmember'
]='0';
$CONFIG['isKrMember'
]='0';
$CONFIG['isGoldenV'
]='0';
$CONFIG['pid'
]='100505';
</script>
<!-- / $CONFIG -->
<link rel="stylesheet" type="text/css" href="https://img.t.sinajs.cn/t4/appstyle/vip_v2/css/apps_PRF/v6fansclub/Pl_Third_RightClub.css?id=123456123?version=8360e4a841c8aaec" id="FM_160622671952813"><div style="position: absolute; top: -9999px;"><div id="js_http:__img.t.sinajs.cn_t4_appstyle_vip_v2_css_apps_PRF_v6fansclub_Pl_Third_RightClub"></div><div id="js_http:__img.t.sinajs.cn_t6_style_css_apps_PCD_event_WB_feed_spec_red2017"></div></div><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/comb_uta_textc.css?version=8360e4a841c8aaec" id="FM_160622671952816"><link rel="stylesheet" type="text/css" href="https://img.t.sinajs.cn/t6/style/css/apps_PCD/event/WB_feed_spec_red2017.css?version=8360e4a841c8aaec" id="FM_160622671952821"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_pictext_f.css?version=8360e4a841c8aaec" id="FM_160622671952825"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_pictext_h.css?version=8360e4a841c8aaec" id="FM_160622671952827"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_feed.css?version=8360e4a841c8aaec" id="FM_160622671952830"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_ut_a.css?version=8360e4a841c8aaec" id="FM_160622671952836"><style></style><div style="position: absolute; top: -9999px; left: -9999px;"><div id="js_//img.t.sinajs.cn/t6/skin/diy/skin"></div></div><link rel="stylesheet" type="text/css" charset="utf-8" href="//img.t.sinajs.cn/t6/skin/diy/skin.css?version=8360e4a841c8aaec" id="skin_style"><style type="text/css" id="custom_style"> @import ' //img.t.sinajs.cn/t6/skin/diy/diy007/skin.css?version=40c445261b223d2b';
body,.S_page{
background-image:url(' //wx1.sinaimg.cn/woriginal/70aa1bb1gy1fyazl2776tj20u00u0adf.jpg');
background-repeat : repeat;
background-attachment: scroll;
background-position : left top;
}
</style><style></style><style></style><link href="//img.t.sinajs.cn/t6/style/css/module/combination/extra.css?version=8360e4a841c8aaec" type="text/css" rel="stylesheet"><link rel="Stylesheet" type="text/css" charset="utf-8" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_mplayer.css?version=44f46a2945abfbb8"></head>
<body class="FRAME_page B_page S_page">
<div class="WB_miniblog">
<div class="WB_miniblog_fb">
<div id="pl_common_top"> <!--顶部导航-->
<div class="WB_global_nav WB_global_nav_v2 WB_global_nav_alpha ">
<div class="gn_header clearfix">
<div class="gn_logo" node-type="logo" data-logotype="logo" data-logourl="//weibo.com?topnav=1&mod=logo">
<a href="//weibo.com?topnav=1&mod=logo" class="box" title="" node-type="logolink" suda-uatrack="key=topnav_tab&value=weibologo">
<span class="logo"></span>
</a>
</div>
<div class=" gn_search_v2">
<span class=" placeholder">搜索微博、找人</span><input type="text" node-type="searchInput" autocomplete="nope" value="" class="W_input S_bg1" name="16062267196598">
<a href="javascript:void(0);" title="搜索" node-type="searchSubmit" class="W_ficon ficon_search S_ficon" suda-uatrack="key=topnav_tab&value=search">f</a>
<!--搜索热词下拉-->
<div class="gn_topmenulist_search" node-type="searchSuggest" style="display:none;">
<div class="gn_topmenulist">
<div node-type="basic"></div>
<div node-type="plus"></div>
</div>
</div>
<!--/搜索热词下拉-->
</div>
<div class="gn_position">
<div class="gn_nav">
<ul class="gn_nav_list">
<li><a href="//weibo.com" class="S_txt1" suda-uatrack="key=topnav_tab&value=homepage"><em class="W_ficon ficon_home S_ficon">E</em><em class="S_txt1">首页</em></a></li>
<li><a href="https://weibo.com/tv" nm="tv" class="S_txt1" suda-uatrack="key=topnav_tab&value=video"><em class="W_ficon ficon_wb_vb S_ficon"></em><em class="S_txt1">视频</em></a></li>
<li><a dot="pos55b9e1ad88ae4" href="//d.weibo.com/?topnav=1&mod=logo" class="S_txt1" suda-uatrack="key=topnav_tab&value=discover"><em class="W_ficon ficon_found S_ficon">F</em><em class="S_txt1">发现</em></a></li>
<li><a href="http://game.weibo.com?topnav=1&mod=logo" class="S_txt1" suda-uatrack="key=topnav_tab&value=game"><em class="W_ficon ficon_game S_ficon">G</em><em class="S_txt1">游戏</em></a></li>
</ul>
</div>
<div class="gn_login">
<ul class="gn_login_list">
<li><a href="//weibo.com/signup/signup.php" class="S_txt1">注册</a></li>
<li class="W_vline S_line1"></li>
<li><a node-type="loginBtn" href="javascript:void(0)" class="S_txt1">登录</a></li>
</ul>
</div>
</div>
</div>
</div>
</div>
<!-- 未登录注册邀请页 -->
<div id="pl_common_unlogininvitereg"></div>
<!-- /未登录注册邀请页 -->
<!-- 未登录base页 -->
<div id="pl_common_unloginbase"></div>
<!-- /未登录base页 -->
<!-- SUDA_CODE_START -->
<script type="text/javascript" charset="utf-8">
(function() {
var doc = document,
wa = doc.createElement('script'),
s = doc.getElementsByTagName('script')[
0
];
wa.type = 'text/javascript';
wa.charset = 'utf-8';
wa.async = true;
wa.src = ('https:' == doc.location.protocol ? 'https: //' : 'http://') + 'js.t.sinajs.cn/open/analytics/js/suda.js?version=44f46a2945abfbb8';
s.parentNode.insertBefore(wa, s);
})();
</script>
<noscript>
<img width="0" height="0" src="https://p3-juejin.byteimg.com/tos-cn-i-k3u1fbpfcp/2503c1b7cf424530a8cdf2e4fc35001a~tplv-k3u1fbpfcp-zoom-1.image" border="0" alt="" />
</noscript>
<!-- SUDA_CODE_END --> <div class="WB_main clearfix" id="plc_frame"><div class="WB_frame">
字数限制,不全部展示了