Python爬虫之JavaScript逆向,喜马拉雅加密算法分析

1,594 阅读6分钟

前言

这几天一直在听评书,发现喜马拉雅上的资源很多,不过很可惜都是付费的,所以我充了一个月会员,简单写了个爬虫,爬下来几十部,够我听一年的了

开始分析

打开chrome控制台,点击播放,最先拿到的一个接口就是

私信小编01即可获取大量Python学习资源

https://mpay.ximalaya.com/mobile/track/pay/244130607/?device=pc

当然这个是付费的一部书,所以如果你浏览器不带 会员的cookie是访问不到的,其中的数字 244130607,这个在他们的接口中叫做 trackId, 每个音频文件对应唯一的一个 trackId

Python爬虫之JavaScript逆向,喜马拉雅加密算法分析

也就是对应这个界面的后面的数字,通过这个唯一的trackId可以获取到音频文件,那么看一下这个接口返回的内容.最后,如果你的时间不是很紧张,并且又想快速的提高,最重要的是不怕吃苦,建议你可以联系维:762459510 ,那个真的很不错,很多人进步都很快,需要你不怕吃苦哦!大家可以去添加上看一下~

{
  "ret": 0,
  "msg": "0",
  "trackId": 244130607,
  "uid": 170217760,
  "albumId": 30816438,
  "title": "《三体》第一季 第十集 聚会与大撕裂",
  "domain": "http://audiopay.cos.xmcdn.com",
  "totalLength": 12780565,
  "sampleDuration": 0,
  "sampleLength": 0,
  "isAuthorized": true,
  "apiVersion": "1.0.0",
  "seed": 9583,
  "fileId": "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*",
  "buyKey": "617574686f72697a6564",
  "duration": 1578,
  "ep": "20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f",
  "highestQualityLevel": 1,
  "downloadQualityLevel": 1,
  "authorizedType": 1
}

这里,我充了会员,所以可以直接在浏览器中打开这个url。其中有用的字段只有几个:seed 和 fileId 这两个字段通过js加密算法计算出 m4a 的路径,再拼接上主域名(domain);ep 则经过另一个加密算法得到url的访问参数 buy_key、sign、token、timestamp。最后将它们拼接到一起,才是一个完整的音频url

两个js加密算法

经过我调试我分别找到了这两个加密的 js算法

  1. 计算 m4a的路径js算法:
// Characters that can appear in a decoded media path. The "\\" escape matters:
// the shuffle below must run over all 68 characters (including one literal
// backslash), otherwise every pick after the first few lands on a different
// character and the decoded path is wrong.
const PATH_ALPHABET =
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890";

/**
 * Seeded alphabet shuffler used to turn the API's `fileId` into a media path.
 *
 * Constructing an instance immediately derives `_cgStr`, a permutation of
 * PATH_ALPHABET that depends only on the seed.
 */
class vt {
  /**
   * @param {number} t - The `seed` value from the API response.
   */
  constructor(t) {
    this._randomSeed = t;
    this.cg_hun();
  }

  /**
   * Builds `_cgStr` by repeatedly drawing one character from the remaining
   * pool (position chosen by `ran()`) and removing it from the pool.
   */
  cg_hun() {
    this._cgStr = "";
    let pool = PATH_ALPHABET;
    const rounds = pool.length;
    for (let round = 0; round < rounds; round++) {
      const pick = Math.floor(this.ran() * pool.length);
      const ch = pool.charAt(pick);
      this._cgStr += ch;
      pool = pool.split(ch).join("");
    }
  }

  /**
   * Maps a "*"-separated list of indices onto the shuffled alphabet.
   *
   * The final segment is always ignored: the API terminates the list with a
   * trailing "*", which leaves an empty last element after splitting.
   *
   * @param {string} t - The `fileId` value, e.g. "27*31*44*".
   * @returns {string} The decoded characters, concatenated.
   */
  cg_fun(t) {
    const parts = t.split("*");
    let decoded = "";
    for (let n = 0; n < parts.length - 1; n++) {
      decoded += this._cgStr.charAt(Number(parts[n]));
    }
    return decoded;
  }

  /**
   * Advances the linear congruential generator and returns the new state
   * scaled into [0, 1).
   *
   * @returns {number}
   */
  ran() {
    this._randomSeed = (211 * this._randomSeed + 30031) % 65536;
    return this._randomSeed / 65536;
  }
}

/**
 * Decodes a seed/fileId pair into a media path that starts with "/".
 *
 * @param {number} t - The `seed` value from the API response.
 * @param {string} e - The `fileId` value from the API response.
 * @returns {string} The path to append to the response's `domain`.
 */
function c(t, e) {
  const path = new vt(t).cg_fun(e);
  return path[0] === "/" ? path : "/".concat(path);
}
// Decode the seed/fileId pair from the sample response and print the media path.
// The fileId literal must stay on one line: a raw line break inside a quoted
// string is a syntax error.
console.log(c(9583, "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*"));

node跑一下可以得到 m4a的路径
输出

/group3/M04/9E/88/wKgMbF4ejn2TfGPRAMMEFYoRHXs027.m4a
  2. 通过ep来计算url参数的js算法:
/**
 * Always throws: the fallback used when a value can be neither treated as an
 * array nor iterated.
 *
 * @throws {TypeError}
 */
function Z() {
  throw new TypeError("Invalid attempt to destructure non-iterable instance");
}

/**
 * Collects values from an iterable into an array, stopping early once `e`
 * values have been gathered.
 *
 * `for...of` closes the iterator (calls its `return()`) when the loop is left
 * early, and lets any error raised while iterating propagate to the caller.
 *
 * @param {Iterable<*>} t - The iterable to read from.
 * @param {number} [e] - Maximum number of values to take; falsy means "all".
 * @returns {Array<*>} The collected values.
 * @throws {TypeError} If `t` is not iterable.
 */
function J(t, e) {
  const collected = [];
  for (const value of t) {
    collected.push(value);
    if (e && collected.length === e) {
      break;
    }
  }
  return collected;
}
/**
 * Returns `t` unchanged when it is an array, otherwise `undefined`.
 *
 * @param {*} t
 * @returns {Array<*>|undefined}
 */
function Q(t) {
  if (Array.isArray(t)) {
    return t;
  }
  return undefined;
}

/**
 * Array-destructuring helper: uses the array as-is, falls back to collecting
 * up to `e` values from an iterable via `J`, and finally raises via `Z`.
 *
 * @param {*} t - Array or iterable to unpack.
 * @param {number} [e] - Number of values wanted.
 * @returns {Array<*>}
 */
function tt(t, e) {
  return Q(t) || J(t, e) || Z();
}

/**
 * RC4 stream cipher over JavaScript strings (one UTF-16 code unit per byte).
 * Applying it twice with the same key restores the input.
 *
 * @param {string} t - Key; its char codes are folded modulo 256.
 * @param {string} e - Text to encrypt or decrypt.
 * @returns {string} The transformed text, same length as `e`.
 */
function yt(t, e) {
  // Key-scheduling: start from the identity permutation and mix in the key.
  const state = Array.from({ length: 256 }, (_, index) => index);
  let j = 0;
  for (let i = 0; i < 256; i++) {
    j = (j + state[i] + t.charCodeAt(i % t.length)) % 256;
    [state[i], state[j]] = [state[j], state[i]];
  }

  // Keystream generation, XORed onto the input one character at a time.
  let output = "";
  let i = 0;
  j = 0;
  for (let pos = 0; pos < e.length; pos++) {
    i = (i + 1) % 256;
    j = (j + state[i]) % 256;
    [state[i], state[j]] = [state[j], state[i]];
    output += String.fromCharCode(
      e.charCodeAt(pos) ^ state[(state[i] + state[j]) % 256]
    );
  }
  return output;
}

// Key material that bt() wraps as "d" + mt + "9" before permuting it with gt.
// NOTE(review): the "†" in this literal may be a Windows-1252 rendering of the
// raw byte 0x86 — verify against the site's original script before relying on it.
const mt = yt("xm", "Ä[üJ=†Û3áf÷N");

// Permutation of 0..35 used by bt() to substitute the characters of the key.
const gt = [
  19, 1, 4, 7, 30, 14, 28, 8, 24, 17, 6, 35,
  34, 16, 9, 10, 13, 22, 32, 29, 31, 21, 18, 3,
  2, 23, 25, 27, 11, 20, 5, 15, 12, 0, 33, 26,
];
/**
 * Substitutes every character of `text` through the inverse of `table`.
 *
 * Lowercase letters count as 0..25 and every other character as
 * (char code - "0" + 26), so digits land on 26..35. Each value is replaced by
 * the position at which it occurs in `table`, then converted back to a
 * character the same way. Values not present in `table` are left unchanged.
 *
 * @param {string} text - Text to substitute.
 * @param {number[]} table - Substitution table (a permutation of 0..35).
 * @returns {string}
 */
function permuteKey(text, table) {
  const zero = "0".charCodeAt(0);
  return text
    .split("")
    .map((ch) => {
      const isLower = ch >= "a" && ch <= "z";
      const slot = isLower ? ch.charCodeAt(0) - 97 : ch.charCodeAt(0) - zero + 26;
      const found = table.indexOf(slot);
      const mapped = found === -1 ? slot : found;
      return mapped > 25
        ? String.fromCharCode(mapped - 26 + zero)
        : String.fromCharCode(mapped + 97);
    })
    .join("");
}

/**
 * Decodes base64 into a string with one character per decoded byte.
 *
 * Uses Node's Buffer instead of a hand-written lookup table. Node also accepts
 * the URL-safe alphabet ("-" and "_"), which only matters for malformed input.
 *
 * @param {*} encoded - Base64 text; any falsy value yields "".
 * @returns {string}
 */
function decodeBase64(encoded) {
  if (!encoded) {
    return "";
  }
  return Buffer.from(String(encoded), "base64").toString("latin1");
}

/**
 * Decrypts the `ep` field of the API response.
 *
 * The RC4 key is "d" + mt + "9" run through the gt substitution; the payload is
 * the base64-decoded `ep`. The plaintext is split on "-"; the article's sample
 * run yields four parts: buy_key, sign, token, timestamp.
 *
 * @param {string} t - The `ep` value from the API response.
 * @returns {string[]} The "-"-separated parts, which are also logged.
 */
function bt(t) {
  const key = permuteKey("d" + mt + "9", gt);
  const e1 = yt(key, decodeBase64(t)).split("-");
  console.log(e1);
  return e1;
}

// `var` is kept so this demo can share a file with the path-decoding
// snippet, which also defines `c`.
var c = bt("20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f");

这段js比较复杂,调试的时候坑死我了,不在同一个地方,导致我来回复制,最终才把这个算法整理到这一个js文件中,依然用 node跑一下,输出:

[ '617574686f72697a6564', 'ef9a0678d77870843ef203d6333ce021', '5790', '1598533668' ]

这几个参数分别对应的是:buy_key sign token timestamp
有了这两个js算法就可以完全地解析 这个接口返回的参数了。最后,如果你的时间不是很紧张,并且又想快速的提高,最重要的是不怕吃苦,建议你可以联系维:762459510 ,那个真的很不错,很多人进步都很快,需要你不怕吃苦哦!大家可以去添加上看一下~

python 代码仿写加密算法

  1. 计算 m4a路径加密算法
class vt():
    """Seeded alphabet shuffler used to turn the API's ``fileId`` into a media path.

    Creating an instance immediately derives ``_cgStr``, a permutation of
    ``_ALPHABET`` that depends only on the seed.
    """

    # The backslash is escaped explicitly: the alphabet must contain one literal
    # backslash (68 characters in total) and "\:" is not a valid escape sequence.
    _ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890"

    def __init__(self, t):
        """Store the seed ``t`` and build the shuffled alphabet."""
        self._randomSeed = t
        self.cg_hun()

    def ran(self):
        """Advance the linear congruential generator; return the state scaled to [0, 1)."""
        self._randomSeed = (211 * self._randomSeed + 30031) % 65536
        return self._randomSeed / 65536

    def cg_hun(self):
        """Build ``_cgStr`` by drawing characters from the pool one at a time."""
        self._cgStr = ""
        pool = self._ALPHABET
        for _ in range(len(pool)):
            pick = int(self.ran() * len(pool))
            ch = pool[pick]
            self._cgStr += ch
            pool = pool.replace(ch, "")

    def cg_fun(self, t):
        """Map a "*"-separated index list onto the shuffled alphabet.

        Empty segments count as index 0, and the final segment is always
        ignored (the API terminates the list with a trailing "*").
        """
        indices = [int(i) if i else 0 for i in t.split("*")]
        return "".join(self._cgStr[i] for i in indices[:-1])


def path_decode(seed, fileId):
    """Decode a ``seed``/``fileId`` pair into the media path (no leading "/" is added)."""
    return vt(seed).cg_fun(fileId)


if __name__ == '__main__':
    result = path_decode(9583, "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*")
    print(result)
  2. 通过ep来计算url参数的算法:
import base64

# Characters of the standard base64 alphabet; anything else is ignored on input.
B64_ALPHABET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
)


def yt(t, e):
    """RC4 stream cipher over text: key ``t``, data ``e``.

    Key code points are folded modulo 256. Applying the function twice with
    the same key restores the input. Returns a string as long as ``e``.
    """
    # Key-scheduling: start from the identity permutation and mix in the key.
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + ord(t[i % len(t)])) % 256
        state[i], state[j] = state[j], state[i]

    # Keystream generation, XORed onto the input one character at a time.
    out = []
    i = j = 0
    for ch in e:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        out.append(chr(ord(ch) ^ state[(state[i] + state[j]) % 256]))
    return "".join(out)


# Key material that bt() wraps as "d" + mt + "9" before permuting it with gt.
# NOTE(review): the "†" in this literal may be a Windows-1252 rendering of the
# raw byte 0x86 - verify against the site's original script before relying on it.
mt = yt("xm", "Ä[üJ=†Û3áf÷N")

# Permutation of 0..35 used by bt() to substitute the characters of the key.
gt = [19, 1, 4, 7, 30, 14, 28, 8, 24, 17, 6, 35, 34,
      16, 9, 10, 13, 22, 32, 29, 31, 21, 18, 3, 2, 23, 25,
      27, 11, 20, 5, 15, 12, 0, 33, 26]


def permute_key(text, table):
    """Substitute every character of ``text`` through the inverse of ``table``.

    Lowercase letters count as 0..25 and every other character as
    ``ord(ch) - ord("0") + 26`` (digits land on 26..35). Each value is replaced
    by the first position at which it occurs among the first 36 entries of
    ``table`` and converted back to a character the same way. Values that do
    not occur in ``table`` are left unchanged.
    """
    inverse = {}
    for index, value in enumerate(table[:36]):
        inverse.setdefault(value, index)

    out = []
    for ch in text:
        if "a" <= ch <= "z":
            slot = ord(ch) - 97
        else:
            slot = ord(ch) - ord("0") + 26
        slot = inverse.get(slot, slot)
        if slot > 25:
            out.append(chr(slot - 26 + ord("0")))
        else:
            out.append(chr(slot + 97))
    return "".join(out)


def b64_to_latin1(data):
    """Decode base64 text into a string with one character per decoded byte.

    Characters outside the base64 alphabet are skipped, decoding stops at the
    first "=", and missing padding is tolerated (a dangling single character
    that cannot form a byte is dropped). Falsy input yields "".
    """
    if not data:
        return ""
    head = str(data).split("=", 1)[0]
    payload = "".join(ch for ch in head if ch in B64_ALPHABET)
    if len(payload) % 4 == 1:
        payload = payload[:-1]
    payload += "=" * (-len(payload) % 4)
    return base64.b64decode(payload).decode("latin-1")


def bt(t):
    """Decrypt the ``ep`` field into its four URL parameters.

    The RC4 key is "d" + mt + "9" run through the gt substitution and the
    payload is the base64-decoded ``ep``. The plaintext must consist of exactly
    four "-"-separated parts; otherwise the unpacking raises ValueError.

    Returns a dict with the keys buy_key, sign, token and timestamp.
    """
    key = permute_key("d" + mt + "9", gt)
    buy_key, sign, token, timestamp = yt(key, b64_to_latin1(t)).split('-')
    return dict(
        buy_key=buy_key,
        sign=sign,
        token=token,
        timestamp=timestamp,
    )


def ep_decode(ep):
    """Public entry point: decode the API's ``ep`` value (see bt)."""
    return bt(ep)


if __name__ == '__main__':
    print(ep_decode('20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f'))

这个接口到此为止才算是完全可以解析。

免费接口分析

如果你没有充会员,免费的音频还是可以听的,我找到一个免费音频的接口

https://www.ximalaya.com/revision/play/v1/audio?id=324681559&ptype=1
{"ret": 200,"data": {"trackId": 324681559,"canPlay": true,"isPaid": false,"hasBuy": true,"src": "https://aod.cos.tx.xmcdn.com/group84/M03/4A/A6/wKg5Hl8s0cTwcp6xABQ0EbeuW5Q193.m4a","albumIsSample": false,"sampleDuration": 48,"isBaiduMusic": false,"firstPlayStatus": true,"isVipFree": false}}

这个接口还是比较简单的,返回值里面直接包含 m4a音频地址,没有加密措施,另外 url中的数字依然是 trackId,值得一提的是免费音频的trackId不能用在付费接口,我猜测是版本迭代的问题,或者是客户端不同的问题,因为当时我不只是分析网页的接口,还抓包了电脑客户端的接口,具体对应的是网页还是客户端我也忘了。

解析整本书的接口

喜马拉雅接口主要关键的有两个参数,一个是前面我说的 trackId 另一个就是albumId,trackId 对应唯一的一个音频,而 albumId 对应的是唯一的一本书。

https://www.ximalaya.com/revision/album/v1/getTracksList?albumId=30816438&pageNum=1&pageSize=1000

返回值中就有每一集的trackId,其实喜马拉雅还有很多其他接口,搜索接口等等,一般的其他的接口需要在请求头中加入xm-sign,我也写了xm-sign的计算方法:

import hashlib
import json
import random
import time

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Certificate verification is switched off for the requests below, so the
# matching warning is silenced.
# NOTE(review): verify=False exposes the traffic to interception; drop it
# unless a local proxy or broken certificate chain makes it necessary.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def get_sign(headers):
    """Add the ``xm-sign`` header to ``headers`` and return the same dict.

    The signature is
    ``md5("himalaya-<serverTime>")(<rand>)<serverTime>(<rand>)<nowTime>``,
    where ``serverTime`` is the body of /revision/time, each ``rand`` is a
    random integer in 0..100 and ``nowTime`` is the local time in milliseconds.
    """
    serverTimeUrl = "https://www.ximalaya.com/revision/time"
    response = requests.get(
        serverTimeUrl, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
    )
    serverTime = response.text
    nowTime = str(round(time.time() * 1000))
    digest = hashlib.md5("himalaya-{}".format(serverTime).encode()).hexdigest()
    sign = "{}({}){}({}){}".format(
        digest,
        round(random.random() * 100),
        serverTime,
        round(random.random() * 100),
        nowTime,
    )
    headers["xm-sign"] = sign
    return headers


def get_header():
    """Build request headers (User-Agent plus a fresh ``xm-sign``)."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
    }
    headers = get_sign(headers)
    return headers


if __name__ == '__main__':
    # Example: call the search endpoint with signed headers.
    url = "https://www.ximalaya.com/revision/search/main?core=all&spellchecker=true&device=iPhone&kw=%E9%9B%AA%E4%B8%AD%E6%82%8D%E5%88%80%E8%A1%8C&page=1&rows=20&condition=relation&fq=&paidFilter=false"
    s = requests.get(url, headers=get_header(), verify=False, timeout=REQUEST_TIMEOUT)
    print(s.json())

还有很多其他接口,我就懒得说了,因为我不想写了,有了这些就可以满足我下载整本书的需求了

最终整合

我写了 喜马拉雅 扫码登陆的脚本,因为我不能每次都去复制浏览器中的 cookie,这种重复劳动太傻了

import http.cookiejar as cookielib
import os
import re
import sys
import time
from base64 import b64decode
from io import BytesIO
from threading import Thread

import psutil
import requests
from PIL import Image

# Certificate verification is switched off for the requests below, so the
# matching warning is silenced.
requests.packages.urllib3.disable_warnings()

COOKIE_DIR = ".cookie"
COOKIE_FILE = ".cookie/xmly.txt"
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


class show_code(Thread):
    """Background thread that displays the QR-code image given as raw bytes."""

    def __init__(self, data):
        """Keep the raw image bytes (``data``) for ``run``."""
        Thread.__init__(self)
        self.data = data

    def run(self):
        """Open the stored bytes as a PIL image and show it."""
        img = Image.open(BytesIO(self.data))
        img.show()


def is_login(session):
    """Check whether ``session`` is logged in.

    Tries to load previously saved cookies into the session's cookie jar, then
    asks the getCurrentUser endpoint. Returns ``(session, True)`` when the
    response's ``ret`` field is 200, otherwise ``(session, False)``.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}
    url = "https://www.ximalaya.com/revision/main/getCurrentUser"
    try:
        session.cookies.load(ignore_discard=True)
    except (OSError, AttributeError):
        # Best effort: a missing, empty or malformed cookie file (LoadError is
        # an OSError) or a cookie jar without file support just means there is
        # nothing to restore.
        pass
    response = session.get(url, verify=False, headers=headers, timeout=REQUEST_TIMEOUT)
    payload = response.json()
    if payload['ret'] == 200:
        print(payload)
        return session, True
    return session, False


def login():
    """Return a ``requests`` session that is logged in to Ximalaya.

    Cookies saved in ``.cookie/xmly.txt`` are reused when still valid.
    Otherwise a QR code is requested and displayed, the check endpoint is
    polled once per second until it reports ``ret == 0``, and the cookies are
    written back to the cookie file.
    """
    os.makedirs(COOKIE_DIR, exist_ok=True)
    if not os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE, 'w') as f:
            f.write("")

    session = requests.session()
    session.cookies = cookielib.LWPCookieJar(filename=COOKIE_FILE)
    session, status = is_login(session)
    if not status:
        url = "https://passport.ximalaya.com/web/qrCode/gen?level=L"
        response = session.get(url, verify=False, timeout=REQUEST_TIMEOUT)
        data = response.json()
        # The QR image arrives base64-encoded; show it without blocking the poll loop.
        t = show_code(b64decode(data['img']))
        t.start()
        qrId = data['qrId']
        url = 'https://passport.ximalaya.com/web/qrCode/check/%s/%s' % (qrId, int(time.time() * 1000))
        while 1:
            response = session.get(url, verify=False, timeout=REQUEST_TIMEOUT)
            data = response.json()
            if data['ret'] == 0:
                break
            time.sleep(1)
        # Mirror load(ignore_discard=True): cookies flagged "discard" must be
        # written too, or is_login() has nothing to restore on the next run.
        session.cookies.save(ignore_discard=True)
    return session


if __name__ == '__main__':
    login()

这是一个简单的扫码登陆脚本,登陆成功后cookie会自动保存成文件,下次使用的时候直接调用:

# login() returns the requests session it prepared; use that session for the
# API calls that need the logged-in state.
session = login()

就能在保持登陆状态下,访问各种接口.最后,如果你的时间不是很紧张,并且又想快速的提高,最重要的是不怕吃苦,建议你可以联系维:762459510 ,那个真的很不错,很多人进步都很快,需要你不怕吃苦哦!大家可以去添加上看一下~