Python urllib、urllib2使用

560 阅读2分钟
# -*- coding: UTF-8 -*-

# Fetch one Baidu search-result page for a user-supplied keyword and save
# it to disk. (Python 2: urllib2 + raw_input.)

# urllib provides urlencode() in Python 2
import urllib

# urllib2 provides urlopen() in Python 2
import urllib2

keyword = raw_input('请输入搜索词>>>')

# URL-encode the query parameter so non-ASCII input is transmitted safely
wd = urllib.urlencode({'wd': keyword})

url = 'http://www.baidu.com/s?' + wd

# urlopen() sends the request and returns a file-like response object
response = urllib2.urlopen(url)

# read() drains the whole response body as a byte string
html = response.read()

# Write the page to disk.
# Fixes vs. original: 'with' guarantees the handle is closed even if the
# write raises (the original never closed it on error); f.write() avoids
# the spurious trailing newline added by `print >> f`; and the output
# filename typo 'baidui.html' is corrected to 'baidu.html'.
with open('./baidu.html', 'w') as f:
    f.write(html)

简单写一个小爬虫程序,来爬取百度LOL吧的所有网页。 先写一个main,提示用户输入要爬取的贴吧名,并用urllib.urlencode()进行转码,然后组合url,假设是lol吧,那么组合后的url就是:http://tieba.baidu.com/f?kw=lol

# -*- coding: UTF-8 -*-

import urllib

import urllib2

def loadPage(url,fname):

    print '正在下载。。。' + fname

    #headers = {
      #  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}

    #request  =  urllib2.Request(url,headers= headers)

    response = urllib2.urlopen(url)

    html = response.read()

    return html


def saveFile(html,fname):

    print '正在保存。。。' + fname

    f = open('./' + fname, 'w')

    print >> f, html

    f.close()

if __name__ == '__main__':

    keyword = raw_input('请输入贴吧关键字>>>')

    beginPage = int(raw_input('请输入起始页:>>>'))

    endPage = int(raw_input('请输入结束页:>>>'))

    kw = urllib.urlencode({'kw': keyword})

    # Keep the keyword-only URL in its own variable so each page's URL is
    # rebuilt from scratch. BUG FIX: the original did
    # `url = url + '&' + pn` on the same variable inside the loop, so
    # page N's request carried the `pn` of every earlier page as well
    # (e.g. ...?kw=lol&pn=1&pn=2&pn=3).
    base_url = 'http://tieba.baidu.com/f?' + kw

    for i in range(beginPage, endPage + 1):

        fname = 'no' + str(i) + '.html'

        # NOTE(review): tieba appears to paginate with pn in steps of 50
        # ((page-1)*50), not the raw page number — confirm before relying
        # on `i` here meaning "page i".
        pn = urllib.urlencode({'pn': i})

        page_url = base_url + '&' + pn

        html = loadPage(page_url, fname)

        saveFile(html, fname)
        
        
        

有道翻译

# -*- coding: UTF-8 -*-

#有道翻译

import urllib2

import urllib

import time

import random

import hashlib

# url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
keyword = raw_input('请输入翻译内容>>>')

u = 'fanyideskweb'

d = keyword

f = str(int(time.time()*1000) + random.randint(1,10))

c = 'rY0D^0\'nM0}g5Mm1z%1G4'

salt = f

sign = hashlib.md5((u + d + f + c)).hexdigest()

print  salt , sign

formdata = {
   'i' : keyword,
   'from':'AUTO',
   'to':'AUTO',
   'smartresult':'dict',
    'client':'fanyideskweb',
    'salt':salt,
    'sign':sign,
    'doctype':'json',
    'version':2.1,
    'keyfrom':'fanyi.web',
    'action':'FY_BY_CLICKBUTTION'
}

headers = {
  'User-Agent':	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
  'Accept-Language':'zh-CN,zh;q=0.9',
  'X-Requested-With':'XMLHttpRequest',
  'Accept':'application/json, text/javascript, */*; q=0.01',
   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8'

}

data = urllib.urlencode(formdata)

request = urllib2.Request(url,data=data,headers = headers)

response = urllib2.urlopen(request)

print response.read().decode('utf-8')
以上代码均基于 Python 2.7。