# -*- coding: UTF-8 -*-
# Fetch one Baidu search-results page for a user-supplied keyword and save
# the raw HTML to ./baidui.html.  (Python 2.7.)
import urllib
import urllib2

keyword = raw_input('请输入搜索词>>>')
# URL-encode the query parameter: {'wd': keyword} -> 'wd=%E2%80%A6'
wd = urllib.urlencode({'wd': keyword})
url = 'http://www.baidu.com/s?' + wd
# urlopen returns a file-like object; read() yields the whole body as a str.
response = urllib2.urlopen(url)
html = response.read()
# Fix: the original opened the file without a context manager, so an error
# during the write leaked the handle; `print >> f, '%s' % html` was also a
# redundant round of formatting.  (In Python 3 this would be
# print(html, file=f) or f.write(html).)
with open('./baidui.html', 'w') as f:
    f.write(html)
简单写一个小爬虫程序,来爬取百度LOL吧的所有网页。 先写一个main,提示用户输入要爬取的贴吧名,并用urllib.urlencode()进行转码,然后组合url,假设是lol吧,那么组合后的url就是:http://tieba.baidu.com/f?kw=lol
# -*- coding: UTF-8 -*-
import urllib
import urllib2
def loadPage(url,fname):
print '正在下载。。。' + fname
#headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
#request = urllib2.Request(url,headers= headers)
response = urllib2.urlopen(url)
html = response.read()
return html
def saveFile(html,fname):
print '正在保存。。。' + fname
f = open('./' + fname, 'w')
print >> f, html
f.close()
if __name__ == '__main__':
    # Prompt for a tieba name and an inclusive page range, then download
    # each page with loadPage() and persist it with saveFile().
    keyword = raw_input('请输入贴吧关键字>>>')
    beginPage = int(raw_input('请输入起始页:>>>'))
    endPage = int(raw_input('请输入结束页:>>>'))
    kw = urllib.urlencode({'kw': keyword})
    base_url = 'http://tieba.baidu.com/f?' + kw
    for i in range(beginPage, endPage + 1):
        fname = 'no' + str(i) + '.html'
        pn = urllib.urlencode({'pn': i})
        # Bug fix: the original did `url = url + '&' + pn`, mutating the
        # shared url each pass, so page 3 requested ...&pn=1&pn=2&pn=3.
        # Build every page URL fresh from the base instead.
        page_url = base_url + '&' + pn
        # NOTE(review): tieba paginates in steps of 50 posts, so pn may
        # need to be (i - 1) * 50 rather than i — confirm against the site.
        html = loadPage(page_url, fname)
        saveFile(html, fname)
有道翻译
# -*- coding: UTF-8 -*-
#有道翻译
import urllib2
import urllib
import time
import random
import hashlib
# url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
keyword = raw_input('请输入翻译内容>>>')
u = 'fanyideskweb'
d = keyword
f = str(int(time.time()*1000) + random.randint(1,10))
c = 'rY0D^0\'nM0}g5Mm1z%1G4'
salt = f
sign = hashlib.md5((u + d + f + c)).hexdigest()
print salt , sign
formdata = {
'i' : keyword,
'from':'AUTO',
'to':'AUTO',
'smartresult':'dict',
'client':'fanyideskweb',
'salt':salt,
'sign':sign,
'doctype':'json',
'version':2.1,
'keyfrom':'fanyi.web',
'action':'FY_BY_CLICKBUTTION'
}
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
'Accept-Language':'zh-CN,zh;q=0.9',
'X-Requested-With':'XMLHttpRequest',
'Accept':'application/json, text/javascript, */*; q=0.01',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8'
}
data = urllib.urlencode(formdata)
request = urllib2.Request(url,data=data,headers = headers)
response = urllib2.urlopen(request)
print response.read().decode('utf-8')
以上代码均为 Python 2.7 写法。