爬虫JS逆向实践-1688 【JS混淆加密解析】

655 阅读9分钟

持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第N天,点击查看活动详情。大家好,我是辣条。

这是爬虫系列的第36篇,爬虫之路永无止境。

爬取目标 网站:阿里巴巴1688.com - 全球领先的采购批发平台,批发网

工具使用

开发工具:pycharm

开发环境:python3.7, Windows10

使用工具包:requests,urllib, time, re, execjs

重点学习的内容 JS混淆

正则表达式的使用

py执行js文件的应用

网页参数编码

页面分析 爬取:海量产地工厂,就上1688找工厂

h(d.token + "&" + i + "&" + g + "&" + c.data)

在这个JS代码前面打上断点,然后刷新页面,进行js调试。

d.token 的生成 -> H5Request -> 接口请求头里的 cookie 中包含该参数

i = 时间戳

g = 固定参数

c.data =请求参数

h 为 JS 加密方法(从常量和轮函数结构可以看出,它就是 MD5 的一种实现)。把 h 的 JS 代码都抠下来,做成一个单独的 js 文件。

/**
 * h(a): hashes the string `a` and returns the digest as a 32-character
 * lowercase hexadecimal string. The initial state words, per-round shift
 * amounts and additive constants are those of the MD5 algorithm.
 *
 * Inner helpers (all private to h):
 *   b(a, b)      - rotates the 32-bit value `a` left by `b` bits.
 *   c(a, b)      - adds two 32-bit values; the low 30 bits are summed directly
 *                  and the top two bits are combined by hand so the result
 *                  wraps instead of growing past 32 bits.
 *   d, e, f, g   - the four bitwise mixing functions, one per round.
 *   h, i, j, k   - one step of rounds 1-4: mix, add the message word and the
 *                  constant, rotate, then add the second state word.
 *   l(a)         - packs the character codes of `a` into little-endian 32-bit
 *                  words, appends the 0x80 marker byte, and stores the message
 *                  length in bits in the last two words; the word count is a
 *                  multiple of 16.
 *   m(a)         - formats a 32-bit word as 8 hex digits, lowest byte first.
 *   n(a)         - replaces "\r\n" with "\n", then encodes each UTF-16 code
 *                  unit as 1, 2 or 3 bytes (code units >= 2048 always take the
 *                  3-byte form, so surrogate halves are encoded individually).
 *
 * @param {string} a - text to hash.
 * @returns {string} lowercase hex digest of the encoded text.
 *
 * NOTE: on the last line of this span, the text after the function's closing
 * brace is the article's Python snippet, not part of this function.
 */
function h(a) { function b(a, b) { return a << b | a >>> 32 - b } function c(a, b) { var c, d, e, f, g; return e = 2147483648 & a, f = 2147483648 & b, c = 1073741824 & a, d = 1073741824 & b, g = (1073741823 & a) + (1073741823 & b), c & d ? 2147483648 ^ g ^ e ^ f : c | d ? 1073741824 & g ? 3221225472 ^ g ^ e ^ f : 1073741824 ^ g ^ e ^ f : g ^ e ^ f } function d(a, b, c) { return a & b | ~a & c } function e(a, b, c) { return a & c | b & ~c } function f(a, b, c) { return a ^ b ^ c } function g(a, b, c) { return b ^ (a | ~c) } function h(a, e, f, g, h, i, j) { return a = c(a, c(c(d(e, f, g), h), j)), c(b(a, i), e) } function i(a, d, f, g, h, i, j) { return a = c(a, c(c(e(d, f, g), h), j)), c(b(a, i), d) } function j(a, d, e, g, h, i, j) { return a = c(a, c(c(f(d, e, g), h), j)), c(b(a, i), d) } function k(a, d, e, f, h, i, j) { return a = c(a, c(c(g(d, e, f), h), j)), c(b(a, i), d) } function l(a) { for (var b, c = a.length, d = c + 8, e = (d - d % 64) / 64, f = 16 * (e + 1), g = new Array(f - 1), h = 0, i = 0; c > i; ) b = (i - i % 4) / 4, h = i % 4 * 8, g[b] = g[b] | a.charCodeAt(i) << h, i++; return b = (i - i % 4) / 4, h = i % 4 * 8, g[b] = g[b] | 128 << h, g[f - 2] = c << 3, g[f - 1] = c >>> 29, g } function m(a) { var b, c, d = "", e = ""; for (c = 0; 3 >= c; c++) b = a >>> 8 * c & 255, e = "0" + b.toString(16), d += e.substr(e.length - 2, 2); return d } function n(a) { a = a.replace(/\r\n/g, "\n"); for (var b = "", c = 0; c < a.length; c++) { var d = a.charCodeAt(c); 128 > d ? b += String.fromCharCode(d) : d > 127 && 2048 > d ? 
(b += String.fromCharCode(d >> 6 | 192), b += String.fromCharCode(63 & d | 128)) : (b += String.fromCharCode(d >> 12 | 224), b += String.fromCharCode(d >> 6 & 63 | 128), b += String.fromCharCode(63 & d | 128)) } return b } var o, p, q, r, s, t, u, v, w, x = [], y = 7, z = 12, A = 17, B = 22, C = 5, D = 9, E = 14, F = 20, G = 4, H = 11, I = 16, J = 23, K = 6, L = 10, M = 15, N = 21; for (a = n(a), x = l(a), t = 1732584193, u = 4023233417, v = 2562383102, w = 271733878, o = 0; o < x.length; o += 16) p = t, q = u, r = v, s = w, t = h(t, u, v, w, x[o + 0], y, 3614090360), w = h(w, t, u, v, x[o + 1], z, 3905402710), v = h(v, w, t, u, x[o + 2], A, 606105819), u = h(u, v, w, t, x[o + 3], B, 3250441966), t = h(t, u, v, w, x[o + 4], y, 4118548399), w = h(w, t, u, v, x[o + 5], z, 1200080426), v = h(v, w, t, u, x[o + 6], A, 2821735955), u = h(u, v, w, t, x[o + 7], B, 4249261313), t = h(t, u, v, w, x[o + 8], y, 1770035416), w = h(w, t, u, v, x[o + 9], z, 2336552879), v = h(v, w, t, u, x[o + 10], A, 4294925233), u = h(u, v, w, t, x[o + 11], B, 2304563134), t = h(t, u, v, w, x[o + 12], y, 1804603682), w = h(w, t, u, v, x[o + 13], z, 4254626195), v = h(v, w, t, u, x[o + 14], A, 2792965006), u = h(u, v, w, t, x[o + 15], B, 1236535329), t = i(t, u, v, w, x[o + 1], C, 4129170786), w = i(w, t, u, v, x[o + 6], D, 3225465664), v = i(v, w, t, u, x[o + 11], E, 643717713), u = i(u, v, w, t, x[o + 0], F, 3921069994), t = i(t, u, v, w, x[o + 5], C, 3593408605), w = i(w, t, u, v, x[o + 10], D, 38016083), v = i(v, w, t, u, x[o + 15], E, 3634488961), u = i(u, v, w, t, x[o + 4], F, 3889429448), t = i(t, u, v, w, x[o + 9], C, 568446438), w = i(w, t, u, v, x[o + 14], D, 3275163606), v = i(v, w, t, u, x[o + 3], E, 4107603335), u = i(u, v, w, t, x[o + 8], F, 1163531501), t = i(t, u, v, w, x[o + 13], C, 2850285829), w = i(w, t, u, v, x[o + 2], D, 4243563512), v = i(v, w, t, u, x[o + 7], E, 1735328473), u = i(u, v, w, t, x[o + 12], F, 2368359562), t = j(t, u, v, w, x[o + 5], G, 4294588738), w = j(w, 
t, u, v, x[o + 8], H, 2272392833), v = j(v, w, t, u, x[o + 11], I, 1839030562), u = j(u, v, w, t, x[o + 14], J, 4259657740), t = j(t, u, v, w, x[o + 1], G, 2763975236), w = j(w, t, u, v, x[o + 4], H, 1272893353), v = j(v, w, t, u, x[o + 7], I, 4139469664), u = j(u, v, w, t, x[o + 10], J, 3200236656), t = j(t, u, v, w, x[o + 13], G, 681279174), w = j(w, t, u, v, x[o + 0], H, 3936430074), v = j(v, w, t, u, x[o + 3], I, 3572445317), u = j(u, v, w, t, x[o + 6], J, 76029189), t = j(t, u, v, w, x[o + 9], G, 3654602809), w = j(w, t, u, v, x[o + 12], H, 3873151461), v = j(v, w, t, u, x[o + 15], I, 530742520), u = j(u, v, w, t, x[o + 2], J, 3299628645), t = k(t, u, v, w, x[o + 0], K, 4096336452), w = k(w, t, u, v, x[o + 7], L, 1126891415), v = k(v, w, t, u, x[o + 14], M, 2878612391), u = k(u, v, w, t, x[o + 5], N, 4237533241), t = k(t, u, v, w, x[o + 12], K, 1700485571), w = k(w, t, u, v, x[o + 3], L, 2399980690), v = k(v, w, t, u, x[o + 10], M, 4293915773), u = k(u, v, w, t, x[o + 1], N, 2240044497), t = k(t, u, v, w, x[o + 8], K, 1873313359), w = k(w, t, u, v, x[o + 15], L, 4264355552), v = k(v, w, t, u, x[o + 6], M, 2734768916), u = k(u, v, w, t, x[o + 13], N, 1309151649), t = k(t, u, v, w, x[o + 4], K, 4149444226), w = k(w, t, u, v, x[o + 11], L, 3174756917), v = k(v, w, t, u, x[o + 2], M, 718787259), u = k(u, v, w, t, x[o + 9], N, 3951481745), t = c(t, p), u = c(u, q), v = c(v, r), w = c(w, s); var O = m(t) + m(u) + m(v) + m(w); return O.toLowerCase() } 代码实现 import requests from urllib import parse import time import re import execjs ​ ​ time1 = int(time.time() * 1000) with open('./567.js', 'r', encoding='utf-8') as f: ctx = execjs.compile(f.read()) # 执行读取的js代码 data = {"cid":"FactoryRankServiceWidget:FactoryRankServiceWidget","methodName":"execute","params":"{"extParam":"{\"methodName\":\"readRelatedRankEntries\",\"cateId\":7,\"size\":15}"}"} headers = { 'cookie': 'xlly_s=1; cna=s+HmGas+MCYCAXHwrb3t4n2g; ali_ab=240e:383:515:6f10:7c53:a1a0:ef97:a0ef.1635751613086.3; 
taklid=13ffc206f13c40139dcae830da1cd7a6; _m_h5_tk=9ef09cb41d7419fc32115e5eb51faa1a_1635776795184; _m_h5_tk_enc=90cb68510dccd56f830c770f0c4c9c16; alicnweb=touch_tb_at%3D1635768129101; unb=3346228600; cookie2=1bc1323f3cdd57911a40c00689bdd4cb; t=ad0b0773e5f83f3f46a8ad3ddbfb3d0b; tb_token=73b3e307beabe; cn_logon=false; UM_distinctid=17cdb6e180d368-0dd745c063c1f4-57b193e-1fa400-17cdb6e180e8ba; _csrf_token=1635769062782; keywordsHistory=%E7%94%B5%E8%84%91%3B%E7%BE%8E%E5%A6%86%3B%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91%3B%E7%94%B7%E7%94%9F%E5%A4%96%E5%A5%97%3B%E7%AB%A5%E8%A3%85%3B%E6%A3%89%E8%A2%AB; tfstk=cRB1Bw2q2V0_qEVqQGZEu20xQkv1ZXiWc5TGfsOGJ1eSpeS1iDDyPyvFiqd9KH1..; l=eBOC4p3VgzVguXHhXOfwourza77t7IRAguPzaNbMiOCP9T5w5HedW6UhmPTeCnGVh6zpR3WAVUVzBeYBqojidj4Kwp0OeOkmn; isg=BOjoTlOAHfQCkTFQpHAklfc6udb6EUwbw0DEqqIZLGNW_YhnSiBDq_m79ZUNTQTz', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36' }

token 生成

# Build the request signature from the cookie token, timestamp, app key and data.
#
# The cookie carries `_m_h5_tk=<token>_<digits>` (the digits look like a
# millisecond timestamp); only the part before the underscore is the token.
# A pattern that ends in the lazy group `(.+?)` captures exactly one character,
# so the pattern must be terminated by the '_' separator to get the full token.
token = re.findall('m_h5_tk=(.+?)_', headers['cookie'], re.S)[0]
appKey = "12574478"
# Same layout as the page's JS: h(d.token + "&" + i + "&" + g + "&" + c.data)
p = (token + '&' + str(time1) + '&' + appKey + '&' + str(data))
sign = ctx.call('h', p)
# NOTE(review): this URL is truncated in the article ("…") and has no scheme;
# restore the full endpoint (including https://) before running.
url = f'h5api.m.1688.com/h5/mtop.tao…'

需要转码:因为网页请求 URL 中的 data 参数是经过 URL 编码(转码)后的

urls = url + 'data=' + parse.quote(str(data))

print(sign)

response = requests.get(urls, headers=headers).text
print(response)

👇🏻 疑难解答、学习资料、路线图可通过搜索下方 👇🏻

直加辣条小助手,备注:C站

微信名片

关注博主即可阅读全文

五包辣条! 关注

18

47

0

专栏目录 最新1688商品列表接口JS逆向分析 猿小白的博客 384 重要说明:文章教程仅供参考学习,请勿用于非法用途,否则后果自负。 阿里云API网关调用事例源码 08-11 阿里云API网关调用事例源码 sign加密算法 用爬虫批量采集阿里巴巴1688商品数据 weixin_30474613的博客 2959 本文主要介绍如何使用后羿采集器的智能模式,免费采集阿里巴巴 —