Python Crawlers for Popular Translation Websites

Using Python's requests library, I wrote crawlers for several translation websites and am sharing the source code here; anyone who needs it can use it directly. The code is open-sourced on GitHub at github.com/hy-struggle… — anyone interested is welcome to exchange ideas and improve it together.
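
Besides requests, the Baidu and Google crawlers below also rely on execjs (the PyExecJS package on PyPI) to evaluate the sites' JavaScript, and PyExecJS in turn needs a JavaScript runtime such as Node.js available on the machine; it is worth installing both before running the code.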

Results

iCIBA (Kingsoft PowerWord)

iCIBA is very easy to scrape: a single POST request with a handful of form parameters is all it takes.

# POST request
import json

import requests


class King:
    def __init__(self, word):
        self.word = word
        self.url = 'http://fy.iciba.com/ajax.php?a=fy'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        # Build the POST form data
        self.post_data = {
            'f': 'auto',
            't': 'auto',
            'w': self.word
        }

    # Send the request
    def request_post(self):
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        # print(res.content.decode())
        return res.content.decode()

    # Parse the response
    @staticmethod
    def parse_data(data):
        dict_data = json.loads(data)
        # Sentence translations come back under 'out'; single-word lookups under 'word_mean'
        if 'out' in dict_data['content']:
            print(dict_data['content']['out'])
        elif 'word_mean' in dict_data['content']:
            print(dict_data['content']['word_mean'])

    def run(self):
        data = self.request_post()
        self.parse_data(data)


if __name__ == '__main__':
    word = input("翻译:")
    king = King(word)
    king.run()
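
For a quick script that is all it takes. If you plan to reuse the crawler, it may be worth hardening request_post a little; a minimal sketch (the timeout and error handling are my own additions, not part of the original code):

# A more defensive variant of request_post (sketch).
# Adds a timeout, raises on HTTP errors and guards against non-JSON replies.
import requests


def request_post_safe(url, headers, data, timeout=5):
    try:
        res = requests.post(url, headers=headers, data=data, timeout=timeout)
        res.raise_for_status()      # 4xx/5xx responses raise an HTTPError
        return res.json()           # parse the JSON body directly
    except (requests.RequestException, ValueError) as exc:
        print('request failed:', exc)
        return None

The same pattern carries over to the other three crawlers below.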

Bing Translator

Bing is also simple; the only extra piece is a small function that adjusts the POST parameters automatically, depending on whether the input is Chinese or English.

# POST request
import json

import requests


class Biying:
    def __init__(self, word):
        self.word = word
        self.url = 'https://cn.bing.com/ttranslatev3?'
        # self.url = 'https://cn.bing.com/ttranslatev3?isVertical=1&&IG=E3F2E74779804936A4B134F621FE89FB&IID=translator.5028.12'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        # Build the POST form data
        self.post_data = {
            'fromLang': 'auto-detect',
            'to': 'zh-Hans',
            'text': self.word
        }

    # Adjust the POST parameters when the input is Chinese
    def judge_post(self):
        if self.is_chinese(self.word):
            self.post_data['to'] = 'en'
            # print(self.word.encode().isalpha())

    # Check whether the text is Chinese (compares against the CJK Unified Ideographs range)
    @staticmethod
    def is_chinese(uchar):
        return u'\u4e00' <= uchar <= u'\u9fa5'

    # Send the request
    def request_post(self):
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        # print(res.content.decode())
        return res.content.decode()

    # Parse the response
    @staticmethod
    def parse_data(data):
        dict_data = json.loads(data)
        print(dict_data[0]['translations'][0]['text'])

    def run(self):
        self.judge_post()
        data = self.request_post()
        self.parse_data(data)
        # dict_data = json.loads(data)
        # print(dict_data)


if __name__ == '__main__':
    word = input("Translate: ")
    by = Biying(word)
    by.run()
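
One caveat: is_chinese compares the whole input string against single-character bounds, so in effect it only looks at the first character; an input such as "2022年" would be treated as non-Chinese. If you want to detect Chinese anywhere in the input, a small variant (my own sketch, not part of the original repo) works too:

# Scan the whole string instead of only its start (sketch, not from the original code)
def contains_chinese(text):
    # True if any character falls in the CJK Unified Ideographs range used above
    return any(u'\u4e00' <= ch <= u'\u9fa5' for ch in text)

judge_post (and judge_url further down) could call this instead of is_chinese without any other change.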

Baidu Translate

Baidu is a bit harder. The POST request needs more parameters: a cookie, simple_means_flag, sign and token. The cookie and token are tied to each other, while the sign has to be computed by running the page's JavaScript through the execjs library.

# POST request
import json
import execjs
import requests

"""
    1.cookie参数和token参数是对应的
    2.生成sign参数需要通过调用baidu.js程序
"""


class Baidu:

    def __init__(self, word):
        self.word = word
        self.sign = self.get_sign()
        self.url = 'https://fanyi.baidu.com/v2transapi'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',

            'cookie': 'BIDUPSID=EF5D2DCB95CD02713C504B965E680572; PSTM=1508391259; '
                      'BAIDUID=FE94A1C6870007735C0EA30CA092352A:FG=1; '
                      'BDUSS=HhpVTc3VjZrQ2ppRX5RcVFoQW9-WExTQ29zYWR-'
                      'TUluOUQxRGVaWHZrWGlOWmRkRVFBQUFBJCQAAAAAAAAAAAEAAAAUxiG2ZnJlZc31vNG'
                      '~pQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
                      'OKob13iqG9dW; locale=zh; __guid=37525047.783289347368707300.1568961749022.282; '
                      'REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; '
                      'SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u'
                      '4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D;'
                      ' from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C'
                      '%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; yjs_js_security_pass'
                      'port=67080cbdf7d8d4ad0eb8f1513b5feb52c128c29b_1569324592_js; monitor_count=3; Hm_lvt_'
                      '64ecd82404c51e03dc91cb9e8c025574=1568961749,1569324577,1569324592,1569324674; Hm_lpvt_'
                      '64ecd82404c51e03dc91cb9e8c025574=1569324674; __yjsv5_shitong=1.0_7_9055159b9a5e975fcd2c2'
                      'c48931b3bc7b406_300_1569324677995_117.32.216.70_70981334'
        }

        # Build the POST form data
        self.post_data = {
            'from': 'en',
            'to': 'zh',
            'query': self.word,
            'simple_means_flag': '3',
            'sign': self.sign,
            'token': '8d588b57816e1213f2bcfaf52bddbbe2'
        }

    # Compute the sign parameter by running baidu.js
    def get_sign(self):
        query = self.word  # the text to translate
        with open('baidu.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        sign = ctx.call('e', query)
        # print(sign)
        return sign

    # Send the request
    def request_post(self):
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        # print(res.content.decode())
        json_data = json.loads(res.content.decode())
        return json_data

    # Adjust the POST parameters when the input is Chinese
    def judge_post(self):
        if self.is_chinese(self.word):
            self.post_data['from'] = 'zh'
            self.post_data['to'] = 'en'
            # print(self.word.encode().isalpha())

    # Check whether the text is Chinese (compares against the CJK Unified Ideographs range)
    @staticmethod
    def is_chinese(uchar):
        return u'\u4e00' <= uchar <= u'\u9fa5'

    # Parse the response
    @staticmethod
    def parse_data(data):
        # dict_data = json.loads(data)
        print(data['trans_result']['data'][0]['dst'])

    def run(self):
        self.judge_post()
        json_data = self.request_post()
        self.parse_data(json_data)
        # print(data)


if __name__ == '__main__':
    word = input("Translate: ")
    baidu = Baidu(word)
    baidu.run()
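
The cookie and token above are hard-coded from my own browser session, so they will eventually expire. Both can usually be refreshed programmatically: a requests.Session picks up the cookies, and the token appears in the page source of fanyi.baidu.com. A rough sketch (assuming the token is still embedded as token: '…', which may change):

# Sketch: fetch a fresh cookie and token instead of hard-coding them.
# Assumes the page source still contains something like  token: '...'  --
# the regex will need updating if Baidu changes the page.
import re
import requests


def get_session_and_token(headers):
    session = requests.Session()    # the session keeps Baidu's cookies for later requests
    html = session.get('https://fanyi.baidu.com/', headers=headers, timeout=5).text
    match = re.search(r"token: '(\w+)'", html)
    return session, match.group(1) if match else None

The returned session can then be used for the v2transapi POST so that the cookie and token stay in sync.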

Google Translate

Google Translate differs again from Baidu: this crawler uses a GET request. A tk token has to be carried in the URL and, just like the Baidu crawler, it is computed from the page's JavaScript.

# GET request
import json
import execjs
import requests


class Google:
    def __init__(self, word):
        self.word = word
        self.tk = self.get_tk()
        self.sl = 'en'
        self.tl = 'zh-CN'
        self.url = "http://translate.google.cn/translate_a/single?client=t" \
                   "&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca" \
                   "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1" \
                   "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s" % (self.sl, self.tl, self.tk, self.word)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }

    # Check whether the text is Chinese (compares against the CJK Unified Ideographs range)
    @staticmethod
    def is_chinese(uchar):
        return u'\u4e00' <= uchar <= u'\u9fa5'

    # Rebuild the URL when the input is Chinese
    def judge_url(self):
        if self.is_chinese(self.word):
            self.sl = 'zh-CN'
            self.tl = 'en'
            self.url = "http://translate.google.cn/translate_a/single?client=t" \
                       "&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca" \
                       "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1" \
                       "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s" % (self.sl, self.tl, self.tk, self.word)
            # print(self.word.encode().isalpha())

    # Compute tk by running google.js
    def get_tk(self):
        query = self.word
        with open('google.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        tk = ctx.call('TL', query)
        # print(tk)
        return tk

    # Send the request
    def request_get(self):
        res = requests.get(url=self.url, headers=self.headers)
        # print(res.content.decode())
        json_data = json.loads(res.content.decode())
        return json_data

    # Parse the response
    @staticmethod
    def parse_data(data):
        print(data[0][0][0])

    def run(self):
        self.judge_url()
        # print(self.url)
        # print('sl:%s' % self.sl)
        # print('tl:%s' % self.tl)
        json_data = self.request_get()
        self.parse_data(json_data)
        # print(json_data)
        # self.parse_data(data)


if __name__ == '__main__':
    word = input("Translate: ")
    google = Google(word)
    google.run()
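
One caveat with the hand-built URL: the query is interpolated into the string without URL-encoding, so characters that have special meaning in a query string (such as '&' or '+') can break the request. An alternative (my own sketch, not part of the repo) is to let requests build and encode the query string:

# Sketch: pass the query parameters as a dict and let requests encode them.
# A list value (the dt parameters) is sent as repeated keys: dt=at&dt=bd&...
import requests


def google_translate_get(word, tk, sl='en', tl='zh-CN', headers=None):
    params = {
        'client': 't', 'sl': sl, 'tl': tl, 'hl': 'zh-CN',
        'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'],
        'ie': 'UTF-8', 'oe': 'UTF-8', 'otf': 1, 'pc': 1,
        'srcrom': 0, 'ssel': 0, 'tsel': 0, 'kc': 2,
        'tk': tk, 'q': word,
    }
    res = requests.get('http://translate.google.cn/translate_a/single',
                       params=params, headers=headers, timeout=5)
    return res.json()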