java + linux + selenium 模拟浏览器请求获取信息

1,053 阅读1分钟

场景

爬取微博用户首页信息时,需要携带cookie访问,于是想了个笨方法来获取cookie,也就是通过java集成selenium来模拟浏览器请求,从而获取到cookie

步骤

1、linux安装浏览器,以chrome为例

下载方式很多,自行百度也可。我是去官网下载的安装包(根据linux操作系统选择相应的包),用下面的命令安装;如果报错,先执行一下 apt-get update,再注意看报错的最后一行,它会提示你接下来怎么做

sudo dpkg -i google-chrome-stable_current_amd64.deb 

安装成功后,查看下chrome版本(google-chrome --version),这个很关键

记住版本,去下载与该版本对应的驱动包:https://chromedriver.storage.googleapis.com/index.html

解压驱动,放在/usr/bin目录下,加上可执行权限

至此,linux准备工作完成了

2、maven依赖selenium包

 <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.4.0</version>
 </dependency>

3、java调用

  // Fetch a Weibo profile page with headless Chrome and return its HTML
        // followed by the cookies the browser received, serialized as JSON.
        //
        // Only needed when chromedriver is not on the PATH (the setup steps above
        // put it in /usr/bin, so it is normally found automatically):
        //System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeOptions chromeOptions = new ChromeOptions();
        // Headless mode is mandatory on a machine without a GUI.
        chromeOptions.addArguments("headless");
        // Also essential in this setup (marked as critical by the original author).
        chromeOptions.addArguments("no-sandbox");
        WebDriver driver = new ChromeDriver(chromeOptions);
        try {
            driver.get("https://weibo.com/mikuya1031?is_hot=1");
            // The page fills in its content asynchronously; give it time to load.
            Thread.sleep(5000);
            String result = driver.getPageSource();
            Set<Cookie> coo = driver.manage().getCookies();
            return result + "---------------------------------------------" + JSON.toJSONString(coo);
        } finally {
            // quit() (rather than close()) ends the whole session and shuts down the
            // driver; doing it in finally prevents leaking browser processes when
            // navigation or the sleep throws.
            driver.quit();
        }

4、效果

url内容:

执行结果:


<html><head>
<meta charset="utf-8">
<meta content="不2不叫周淑怡,不2不叫周淑怡的微博,微博,新浪微博,weibo" name="keywords">
<meta content="不2不叫周淑怡,英雄联盟官方解说 斗鱼直播平台签约主播 《我是唱作人》百人评审。不2不叫周淑怡的微博主页、个人资料、相册,上海七煌信息科技有限公司。新浪微博,随时随地分享身边的新鲜事儿。" name="description">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="renderer" content="webkit">
<meta name="viewport" content="initial-scale=1,minimum-scale=1">
<link rel="dns-prefetch" href="//img.t.sinajs.cn/">
<link rel="dns-prefetch" href="//img1.t.sinajs.cn/">
<link rel="dns-prefetch" href="//js.t.sinajs.cn/">
<link rel="dns-prefetch" href="//js1.t.sinajs.cn/">
<link rel="dns-prefetch" href="//js2.t.sinajs.cn/">
<link rel="dns-prefetch" href="//biz.weibo.com/">
<link rel="dns-prefetch" href="//beacon.sina.com.cn/">
<link rel="dns-prefetch" href="//rs.sinajs.cn/">
<link rel="dns-prefetch" href="//tp1.sinaimg.cn/">
<link rel="dns-prefetch" href="//tp2.sinaimg.cn/">
<link rel="dns-prefetch" href="//tp3.sinaimg.cn/">
<link rel="dns-prefetch" href="//tp4.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww1.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww2.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww3.sinaimg.cn/">
<link rel="dns-prefetch" href="//ww4.sinaimg.cn/">

<link rel="mask-icon" sizes="any" href="//img.t.sinajs.cn/t6/style/images/apple/wbfont.svg" color="black">
<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico">

<title>不2不叫周淑怡的微博_微博</title>
<link type="text/css" rel="stylesheet" charset="utf-8" href="//img.t.sinajs.cn/t6/style/css/module/base/frame.css?version=8360e4a841c8aaec" putoff="style/css/module/combination/extra.css?version=8360e4a841c8aaec">
<link type="text/css" rel="stylesheet" charset="utf-8" href="//img.t.sinajs.cn/t6/style/css/module/combination/PCD_profile_home_A.css?version=8360e4a841c8aaec" includes="style/css/module/pagecard/PCD_counter.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_person_info.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_user_a.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_pictext_a.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_piclist_a.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_mydata.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_photolist.css?version=8360e4a841c8aaec|style/css/module/list/comb_WB_feed_profile.css?version=8360e4a841c8aaec|style/css/module/global/WB_timeline.css?version=8360e4a841c8aaec|style/css/module/pagecard/PCD_profileme.css?version=8360e4a841c8aaec|style/css/module/tab/comb_WB_tab_profile.css?version=8360e4a841c8aaec|style/css/module/list/comb_webim.css?version=8360e4a841c8aaec">
	<link type="text/css" rel="stylesheet" charset="utf-8" href="//img.t.sinajs.cn/t6/skin/diy/skin.css?version=8360e4a841c8aaec">
	
<script type="text/javascript" charset="utf-8" async="" src="https://js.t.sinajs.cn/open/analytics/js/suda.js?version=44f46a2945abfbb8"></script><script type="text/javascript">
~function(l){var h = l.href, r = l.protocol +' //'+ l.host, i = h.indexOf('#!'), s = i != -1 ? h.substr(i + 2) : '';if ( (new RegExp("/[a-zA-Z0-9\\.\\-~!@#$%^&*+?:_/=<>]+", "gi")).test(s) ) l.replace(r + s);}(location)
</script>
<script type="text/javascript">
try{document.execCommand("BackgroundImageCache",
        false, true);
    }catch(e){}
</script>
<!-- $CONFIG -->
<script type="text/javascript">
var $CONFIG = {};
$CONFIG['islogin'
    ]='2'; 
$CONFIG['oid'
    ]='1890196401'; 
$CONFIG['page_id'
    ]='1005051890196401'; 
$CONFIG['onick'
    ]='不2不叫周淑怡'; 
$CONFIG['skin'
    ]='diy'; 
$CONFIG['background'
    ]='70aa1bb1gy1fyazl2776tj20u00u0adf'; 
$CONFIG['scheme'
    ]='diy007'; 
$CONFIG['colors_type'
    ]='0'; 
$CONFIG['uid'
    ]='3655689037'; 
$CONFIG['nick'
    ]='欢迎新用户'; 
$CONFIG['sex'
    ]='m'; 
$CONFIG['watermark'
    ]='u/3655689037'; 
$CONFIG['domain'
    ]='100505'; 
$CONFIG['lang'
    ]='zh-cn'; 
$CONFIG['avatar_large'
    ]='https: //tva1.sinaimg.cn/crop.0.0.179.179.180/d9e5634djw1east9pi6bej2050050dfw.jpg?KID=imgbed,tva&amp;Expires=1606237518&amp;ssig=iilDlEmK9X'; 
$CONFIG['timeDiff'
    ]=(new Date() - 1606226719000); 
$CONFIG['servertime'
    ]='1606226719'; 
$CONFIG['location'
    ]='page_100505_home'; 
$CONFIG['pageid'
    ]=''; 
$CONFIG['title_value'
    ]='不2不叫周淑怡的微博_微博'; 
$CONFIG['$webim'
    ]='1'; 
$CONFIG['miyou'
    ]='1'; 
$CONFIG['brand'
    ]='0'; 
$CONFIG['bigpipe'
    ]='true'; 
$CONFIG['bpType'
    ]='page'; 
$CONFIG['cssPath'
    ]=' //img.t.sinajs.cn/t6/'; 
$CONFIG['imgPath'
    ]=' //img.t.sinajs.cn/t6/'; 
$CONFIG['jsPath'
    ]=' //js.t.sinajs.cn/t6/'; 
$CONFIG['mJsPath'
    ]=[
        "//js{n}.t.sinajs.cn/t6/",
        1,
        2
    ]; 
$CONFIG['mCssPath'
    ]=[
        "//img{n}.t.sinajs.cn/t6/",
        1,
        2
    ]; 
$CONFIG['version'
    ]='44f46a2945abfbb8'; 
$CONFIG['g_mathematician'
    ]='1'; 
$CONFIG['isAuto'
    ]='0'; 
$CONFIG['timeweibo'
    ]='1'; 
$CONFIG['isVmember'
    ]='0'; 
$CONFIG['isKrMember'
    ]='0'; 
$CONFIG['isGoldenV'
    ]='0'; 
$CONFIG['pid'
    ]='100505'; 
</script>
<!-- / $CONFIG -->
<link rel="stylesheet" type="text/css" href="https://img.t.sinajs.cn/t4/appstyle/vip_v2/css/apps_PRF/v6fansclub/Pl_Third_RightClub.css?id=123456123?version=8360e4a841c8aaec" id="FM_160622671952813"><div style="position: absolute; top: -9999px;"><div id="js_http:__img.t.sinajs.cn_t4_appstyle_vip_v2_css_apps_PRF_v6fansclub_Pl_Third_RightClub"></div><div id="js_http:__img.t.sinajs.cn_t6_style_css_apps_PCD_event_WB_feed_spec_red2017"></div></div><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/comb_uta_textc.css?version=8360e4a841c8aaec" id="FM_160622671952816"><link rel="stylesheet" type="text/css" href="https://img.t.sinajs.cn/t6/style/css/apps_PCD/event/WB_feed_spec_red2017.css?version=8360e4a841c8aaec" id="FM_160622671952821"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_pictext_f.css?version=8360e4a841c8aaec" id="FM_160622671952825"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_pictext_h.css?version=8360e4a841c8aaec" id="FM_160622671952827"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_feed.css?version=8360e4a841c8aaec" id="FM_160622671952830"><link rel="stylesheet" type="text/css" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_ut_a.css?version=8360e4a841c8aaec" id="FM_160622671952836"><style></style><div style="position: absolute; top: -9999px; left: -9999px;"><div id="js_//img.t.sinajs.cn/t6/skin/diy/skin"></div></div><link rel="stylesheet" type="text/css" charset="utf-8" href="//img.t.sinajs.cn/t6/skin/diy/skin.css?version=8360e4a841c8aaec" id="skin_style"><style type="text/css" id="custom_style">            @import ' //img.t.sinajs.cn/t6/skin/diy/diy007/skin.css?version=40c445261b223d2b';
              body,.S_page{
               background-image:url(' //wx1.sinaimg.cn/woriginal/70aa1bb1gy1fyazl2776tj20u00u0adf.jpg');
               background-repeat    : repeat;
               background-attachment: scroll;
               background-position  : left top;
    }
        </style><style></style><style></style><link href="//img.t.sinajs.cn/t6/style/css/module/combination/extra.css?version=8360e4a841c8aaec" type="text/css" rel="stylesheet"><link rel="Stylesheet" type="text/css" charset="utf-8" href="//img.t.sinajs.cn/t6/style/css/module/pagecard/PCD_mplayer.css?version=44f46a2945abfbb8"></head>
<body class="FRAME_page B_page S_page">
  <div class="WB_miniblog">
    <div class="WB_miniblog_fb">
      <div id="pl_common_top">    <!--顶部导航--> 
    <div class="WB_global_nav WB_global_nav_v2 WB_global_nav_alpha ">
      <div class="gn_header clearfix">
        <div class="gn_logo" node-type="logo" data-logotype="logo" data-logourl="//weibo.com?topnav=1&amp;mod=logo">
	        <a href="//weibo.com?topnav=1&amp;mod=logo" class="box" title="" node-type="logolink" suda-uatrack="key=topnav_tab&amp;value=weibologo">
	            	            	<span class="logo"></span>
	            	        </a>
        </div>
        <div class=" gn_search_v2">
          <span class=" placeholder">搜索微博、找人</span><input type="text" node-type="searchInput" autocomplete="nope" value="" class="W_input S_bg1" name="16062267196598">
          <a href="javascript:void(0);" title="搜索" node-type="searchSubmit" class="W_ficon ficon_search S_ficon" suda-uatrack="key=topnav_tab&amp;value=search">f</a> 
          <!--搜索热词下拉-->
          <div class="gn_topmenulist_search" node-type="searchSuggest" style="display:none;">
             <div class="gn_topmenulist">
              <div node-type="basic"></div>
              <div node-type="plus"></div>
            </div>
          </div>
          <!--/搜索热词下拉--> 
        </div>       
        <div class="gn_position">
	        <div class="gn_nav">
		         <ul class="gn_nav_list">
		            <li><a href="//weibo.com" class="S_txt1" suda-uatrack="key=topnav_tab&amp;value=homepage"><em class="W_ficon ficon_home S_ficon">E</em><em class="S_txt1">首页</em></a></li>
                     <li><a href="https://weibo.com/tv" nm="tv" class="S_txt1" suda-uatrack="key=topnav_tab&amp;value=video"><em class="W_ficon ficon_wb_vb S_ficon"></em><em class="S_txt1">视频</em></a></li>
                     <li><a dot="pos55b9e1ad88ae4" href="//d.weibo.com/?topnav=1&amp;mod=logo" class="S_txt1" suda-uatrack="key=topnav_tab&amp;value=discover"><em class="W_ficon ficon_found S_ficon">F</em><em class="S_txt1">发现</em></a></li>
		            <li><a href="http://game.weibo.com?topnav=1&amp;mod=logo" class="S_txt1" suda-uatrack="key=topnav_tab&amp;value=game"><em class="W_ficon ficon_game S_ficon">G</em><em class="S_txt1">游戏</em></a></li>
		         </ul>
	        </div>
	        <div class="gn_login">
	          	<ul class="gn_login_list">
	            	<li><a href="//weibo.com/signup/signup.php" class="S_txt1">注册</a></li>
	            	<li class="W_vline S_line1"></li>
	            	<li><a node-type="loginBtn" href="javascript:void(0)" class="S_txt1">登录</a></li>
	          	</ul>
            </div>          
        </div>            
      </div>
    </div>
</div>
      	    <!-- 未登录注册邀请页 -->
	    <div id="pl_common_unlogininvitereg"></div>
	    <!-- /未登录注册邀请页 -->
	    <!-- 未登录base页 -->
	    <div id="pl_common_unloginbase"></div>
	    <!-- /未登录base页 -->
	    <!-- SUDA_CODE_START -->
<script type="text/javascript" charset="utf-8">
(function() {
	var doc = document,
		wa = doc.createElement('script'),
		s = doc.getElementsByTagName('script')[
            0
        ];
    wa.type = 'text/javascript';
    wa.charset = 'utf-8';
    wa.async = true;
    wa.src = ('https:' == doc.location.protocol ? 'https: //' : 'http://') + 'js.t.sinajs.cn/open/analytics/js/suda.js?version=44f46a2945abfbb8';
    s.parentNode.insertBefore(wa, s);
    })();
</script>
<noscript>
<img width="0" height="0" src="https://p3-juejin.byteimg.com/tos-cn-i-k3u1fbpfcp/2503c1b7cf424530a8cdf2e4fc35001a~tplv-k3u1fbpfcp-zoom-1.image" border="0" alt="" />
</noscript>
<!-- SUDA_CODE_END -->	          <div class="WB_main clearfix" id="plc_frame"><div class="WB_frame">
        
        	

            字数限制,不全部展示了