基于Python的request库写了几个翻译网站的爬虫,现将源码分享出来,需要的小伙伴可以直接拿去用。代码已开源到GitHub:github.com/hy-struggle…,有兴趣的朋友可以一起交流和进步哦~
结果展示

金山词霸
金山词霸非常好爬,直接post几个参数就可以了。
# post请求
import json
import requests
class King:
    """Translate a word or sentence through the iciba (金山词霸) web API."""

    def __init__(self, word):
        # Text to translate; language detection is left to the server ('auto').
        self.word = word
        self.url = 'http://fy.iciba.com/ajax.php?a=fy'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        # Form data for the POST request: f/t are source/target languages.
        self.post_data = {
            'f': 'auto',
            't': 'auto',
            'w': self.word
        }

    def request_post(self):
        """Send the POST request and return the decoded response body."""
        response = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        return response.content.decode()

    @staticmethod
    def parse_data(data):
        """Parse the JSON payload and print the translation.

        The API returns 'out' for sentence translations and 'word_mean'
        for single-word dictionary meanings.
        """
        content = json.loads(data)['content']
        if 'out' in content:
            print(content['out'])
        elif 'word_mean' in content:
            print(content['word_mean'])

    def run(self):
        """Fetch and print the translation for ``self.word``."""
        self.parse_data(self.request_post())
if __name__ == '__main__':
    # Read a phrase from stdin and translate it via iciba.
    query = input("翻译:")
    King(query).run()
必应翻译
必应的也很简单,只是需要多一个自动调整post参数的函数,判断输入的是汉字还是英文。
# post请求
import json
import requests
class Biying:
    """Translate text through the Bing translator (ttranslatev3) endpoint."""

    def __init__(self, word):
        # Text to translate.
        self.word = word
        self.url = 'https://cn.bing.com/ttranslatev3?'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        # POST form data: source language is auto-detected; target defaults to
        # simplified Chinese and is flipped to English by judge_post().
        self.post_data = {
            'fromLang': 'auto-detect',
            'to': 'zh-Hans',
            'text': self.word
        }

    def judge_post(self):
        """Switch the target language to English when the input contains Chinese."""
        if self.is_chinese(self.word):
            self.post_data['to'] = 'en'

    @staticmethod
    def is_chinese(uchar):
        """Return True if *uchar* contains at least one CJK character.

        Fix: the original compared the whole string lexicographically against
        the CJK range, which effectively inspected only the first character,
        so mixed input such as 'hello你好' was misclassified as English.
        """
        return any(u'\u4e00' <= ch <= u'\u9fa5' for ch in uchar)

    def request_post(self):
        """Send the POST request and return the decoded response body."""
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        return res.content.decode()

    @staticmethod
    def parse_data(data):
        """Parse the JSON response and print the first translation."""
        dict_data = json.loads(data)
        print(dict_data[0]['translations'][0]['text'])

    def run(self):
        """Adjust the request direction, then fetch and print the translation."""
        self.judge_post()
        self.parse_data(self.request_post())
if __name__ == '__main__':
    # Read a phrase from stdin and translate it via Bing.
    query = input("翻译:")
    Biying(query).run()
百度翻译
百度的相对来说就有一些难度了,post参数多了simple_means_flag、sign和token,此外还需要在请求头中带上与token配套的cookie。其中的cookie和token是对应的,而sign参数需要调用execjs库通过网页的js解析出来。
# post请求
import json
import execjs
import requests
"""
1.cookie参数和token参数是对应的
2.生成sign参数需要通过调用baidu.js程序
"""
class Baidu:
    """Translate text through the Baidu fanyi ``v2transapi`` endpoint.

    Notes:
        1. The ``cookie`` request header and the ``token`` form field are a
           matched pair — replacing one requires replacing the other.
        2. The ``sign`` form field is computed by the site's own JavaScript,
           executed locally with ``execjs`` from ``baidu.js``.
    """

    def __init__(self, word):
        self.word = word
        # sign must be computed before the form data is assembled.
        self.sign = self.get_sign()
        self.url = 'https://fanyi.baidu.com/v2transapi'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'cookie': 'BIDUPSID=EF5D2DCB95CD02713C504B965E680572; PSTM=1508391259; '
                      'BAIDUID=FE94A1C6870007735C0EA30CA092352A:FG=1; '
                      'BDUSS=HhpVTc3VjZrQ2ppRX5RcVFoQW9-WExTQ29zYWR-'
                      'TUluOUQxRGVaWHZrWGlOWmRkRVFBQUFBJCQAAAAAAAAAAAEAAAAUxiG2ZnJlZc31vNG'
                      '~pQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
                      'OKob13iqG9dW; locale=zh; __guid=37525047.783289347368707300.1568961749022.282; '
                      'REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; '
                      'SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u'
                      '4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D;'
                      ' from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C'
                      '%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; yjs_js_security_pass'
                      'port=67080cbdf7d8d4ad0eb8f1513b5feb52c128c29b_1569324592_js; monitor_count=3; Hm_lvt_'
                      '64ecd82404c51e03dc91cb9e8c025574=1568961749,1569324577,1569324592,1569324674; Hm_lpvt_'
                      '64ecd82404c51e03dc91cb9e8c025574=1569324674; __yjsv5_shitong=1.0_7_9055159b9a5e975fcd2c2'
                      'c48931b3bc7b406_300_1569324677995_117.32.216.70_70981334'
        }
        # POST form data; from/to may be flipped later by judge_post().
        self.post_data = {
            'from': 'en',
            'to': 'zh',
            'query': self.word,
            'simple_means_flag': '3',
            'sign': self.sign,
            'token': '8d588b57816e1213f2bcfaf52bddbbe2'
        }

    def get_sign(self):
        """Compute the 'sign' parameter by running the site's JS locally."""
        with open('baidu.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        return ctx.call('e', self.word)

    def request_post(self):
        """POST the form data and return the parsed JSON response."""
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        return json.loads(res.content.decode())

    def judge_post(self):
        """Swap the translation direction when the input contains Chinese."""
        if self.is_chinese(self.word):
            self.post_data['from'] = 'zh'
            self.post_data['to'] = 'en'

    @staticmethod
    def is_chinese(uchar):
        """Return True if *uchar* contains at least one CJK character.

        Fix: the original compared the whole string lexicographically against
        the CJK range, which effectively inspected only the first character.
        """
        return any(u'\u4e00' <= ch <= u'\u9fa5' for ch in uchar)

    @staticmethod
    def parse_data(data):
        """Print the translated text from the parsed JSON response."""
        print(data['trans_result']['data'][0]['dst'])

    def run(self):
        """Adjust the request direction, then fetch and print the translation."""
        self.judge_post()
        self.parse_data(self.request_post())
if __name__ == '__main__':
    # Read a phrase from stdin and translate it via Baidu.
    query = input("翻译:")
    Baidu(query).run()
谷歌翻译
谷歌翻译与百度翻译又有不同,爬虫是通过get请求实现的。需要在url中加入token,而token的获取与百度爬虫一样,是通过js解析出来的。
# GET request (unlike the scrapers above, Google's API is queried via GET)
import json
import execjs
import requests
class Google:
    """Translate text through the translate.google.cn ``single`` API (GET).

    The ``tk`` URL parameter is computed by the site's own JavaScript,
    executed locally with ``execjs`` from ``google.js``.
    """

    def __init__(self, word):
        self.word = word
        # tk must be computed before the URL is built.
        self.tk = self.get_tk()
        # Default direction: English -> simplified Chinese.
        self.sl = 'en'
        self.tl = 'zh-CN'
        self.url = self._build_url()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }

    def _build_url(self):
        """Assemble the request URL from the current sl/tl/tk/word values.

        Extracted so __init__ and judge_url() share one template instead of
        duplicating the long format string.
        """
        return ("http://translate.google.cn/translate_a/single?client=t"
                "&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca"
                "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1"
                "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s"
                % (self.sl, self.tl, self.tk, self.word))

    @staticmethod
    def is_chinese(uchar):
        """Return True if *uchar* contains at least one CJK character.

        Fix: the original compared the whole string lexicographically against
        the CJK range, which effectively inspected only the first character.
        """
        return any(u'\u4e00' <= ch <= u'\u9fa5' for ch in uchar)

    def judge_url(self):
        """Flip the translation direction and rebuild the URL for Chinese input."""
        if self.is_chinese(self.word):
            self.sl = 'zh-CN'
            self.tl = 'en'
            self.url = self._build_url()

    def get_tk(self):
        """Compute the 'tk' URL parameter by running the site's JS locally."""
        with open('google.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        return ctx.call('TL', self.word)

    def request_get(self):
        """GET the URL and return the parsed JSON response."""
        res = requests.get(url=self.url, headers=self.headers)
        return json.loads(res.content.decode())

    @staticmethod
    def parse_data(data):
        """Print the translated text (first segment of the nested response)."""
        print(data[0][0][0])

    def run(self):
        """Adjust the request direction, then fetch and print the translation."""
        self.judge_url()
        self.parse_data(self.request_get())
if __name__ == '__main__':
    # Read a phrase from stdin and translate it via Google.
    query = input("翻译:")
    Google(query).run()