下载漫画只需一个脚本

984 阅读4分钟



近来比较闲,想利用这闲暇的时光恶补下op漫画。打开 鼠绘漫画网,找到onepiece的目录,点击需要补习的篇章。本来准备吹着空调,吃着西瓜愉快的享受op的世界了。没想到的事发生了,在鼠标点击的瞬间出现了跳转




然后就眼睁睁的跳转到腾讯蛋疼的简体汉化版,此时心中无数个草泥马在奔腾啊!!!

由于版权纷争,鼠绘为了生计不得不退出。并且转型为漫画讨论社区,虽然最近几期都给了第三方链接,但是总给人的感觉是偷呢?好在,手机端APP还保留着漫画阅读功能,但是由于屏幕狭小,体验不是很好。于是就有了本篇简单爬虫想法。

前期利用App进行抓包调研鼠绘网后台接口,得到了几个关键的接口。之后就写了如下脚本:
脚本下载地址: cartoon_download.py
#! /usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import socket
import json
import os
import HTMLParser
import threading
import time
import sys
global jsonstr, hostname, croot, maxtimeout, tryagaincount
global threadcount, currentthreadnum, mutex, thenewest
hostname = r'http://www.ishuhui.net'
chapters = r'/ComicBooks/GetChapterList?id=' + '%d' + '&PageIndex=' + '%d'
details = r'/ComicBooks/ReadComicBooksToIsoV1/' + '%d'
repo = {
    'op': 2,
    'SLAM_DUNK': 38,
    '火影忍者': 4,
    '银魂': 10,
    '妖精的尾巴': 3,
    '名侦探柯南': 1,
    'bleach': 23,
    '黑子的篮球': 6,
    '浪客剑心': 39,
    '结界师': 34
}
arg = '[op, SLAM_DUNK, 火影忍者, 银魂, 妖精的尾巴, 名侦探柯南, bleach, 黑子的篮球, 浪客剑心, 结界师]'
cartoonid = repo['op']
thenewest = 0
currentthreadnum = 0
threadcount = 6
tryagaincount = 5
maxtimeout = 30
croot = os.getcwd()
mutex = threading.Lock()
usage = \
    """
    Usage:
            cartoon_download [args...]
            cartoon_download cartoon
            cartoon_download cartoon path
            cartoon_download cartoon path newestcount
            cartoon_download cartoon path newestcount threadcount
    For example:
            cartoon_download op /home/xxx/onepiece
    Note:
            current version just support Linux/Unix os.
            cartoon = %s
            path can be either absolute or relative, but must be en characters.
            default `newestcount` value is 0 to download all chapters, or
            download the newest value chapters.
    """ % arg
if len(sys.argv) == 1:
    print usage
    sys.exit(0)
if len(sys.argv) >= 2:
    try:
        cartoonid = repo[sys.argv[1]]
    except Exception, e:
        print 'Please select from %s' % arg
        sys.exit(0)
    finally:
        pass
if len(sys.argv) >= 3:
    targetdir = sys.argv[2]
    if not os.path.exists(targetdir):
        os.makedirs(targetdir)
    croot = os.path.abspath(targetdir)
    if len(sys.argv) >= 4:
        thenewest = (int)(sys.argv[3])
    if len(sys.argv) == 5:
        threadcount = (int)(sys.argv[4])
class MyHTMLParser(HTMLParser.HTMLParser):
    """Collect image urls (gif/jpg/png) found in <img> tags of a page.

    Every attribute name AND value of each <img> tag is scanned for an
    extension substring; first match wins in the order gif, jpg, png.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.gifs = []
        self.jpgs = []
        self.pngs = []

    def handle_starttag(self, tags, attrs):
        if tags != 'img':
            return
        for pair in attrs:
            # `pair` is an (attr-name, attr-value) tuple; both halves
            # are inspected, exactly as the original parser did.
            for text in pair:
                if 'gif' in text:
                    self.gifs.append(text)
                elif 'jpg' in text:
                    self.jpgs.append(text)
                elif 'png' in text:
                    self.pngs.append(text)

    def get_gifs(self):
        return self.gifs

    def get_jpgs(self):
        return self.jpgs

    def get_pngs(self):
        return self.pngs
class DownloadTask(threading.Thread):
    """Worker thread that downloads exactly one image via pic2file()."""

    def __init__(self, name, srcurl):
        super(DownloadTask, self).__init__()
        self.name = name        # thread label, '#<url>'
        self.srcurl = srcurl    # the image url this worker fetches

    def run(self):
        pic2file(self.srcurl)
def createtask(imgs):
    global mutex, currentthreadnum, threadcount
    print 'current tasks num >> %d' % len(imgs)
    for srcurl in imgs:
        while (currentthreadnum >= threadcount):
            time.sleep(0.5)
        increasethread()
        threadname = '#%s' % srcurl
        task = DownloadTask(threadname, srcurl)
        task.start()
    while (currentthreadnum > 0):
        time.sleep(0.5)
    print 'finished!'
    os.chdir(croot)
def parser2name(picurl):
    """Return the file-name component of a url, query string stripped.

    e.g. 'http://a.com/x/y.jpg?v=1' -> 'y.jpg'.  A url ending in '/'
    yields the empty string, same as the original implementation.
    """
    # rsplit/split replace the manual `names[len(names) - 1]` indexing
    name = picurl.rsplit('/', 1)[-1]
    return name.split('?', 1)[0]
def increasethread():
    """Atomically bump the live worker-thread counter."""
    global currentthreadnum
    with mutex:
        currentthreadnum += 1
def decreasethread():
    """Atomically drop the live worker-thread counter."""
    global currentthreadnum
    with mutex:
        currentthreadnum -= 1
def pic2file(picurl, times=0):
    protocol = picurl.split('/')[0]
    if not(protocol == 'http:' or protocol == 'https:'):
        decreasethread()
        return
    filename = parser2name(picurl)
    if(os.path.exists(filename)):
        decreasethread()
        return
    try:
        pic = urllib.urlopen(picurl)
        data = pic.read()
        picfile = open('%s' % filename, 'wb')
        picfile.write(data)
        picfile.close()
        decreasethread()
    except socket.timeout:
        if times < tryagaincount:
            print "Download '%s' timeout, Trying again." % filename
            pic2file(picurl, times + 1)
        else:
            decreasethread()
            print "Tried %d times, but still failed to %s." %\
                (tryagaincount, filename)
    except Exception as e:
        print('---pic2file error---', e)
        if times < tryagaincount:
            print "Download '%s' timeout, Trying again." % filename
            pic2file(picurl, times + 1)
        else:
            decreasethread()
            print "Task '%s' failed after tring %d times" %\
                (picurl, tryagaincount)
    finally:
        pass
def fixurl(picurl):
    """Force a url onto the http: scheme.

    Drops everything before the first '/' (an empty piece for
    protocol-relative '//host/...' urls, or 'https:' etc.) and rebuilds
    the rest behind 'http:'.  A url with no '/' returns just 'http:'.
    """
    # join over a slice replaces the original xrange-index loop with
    # quadratic '+=' string building; output is byte-identical
    pieces = picurl.split('/')
    return 'http:' + ''.join('/%s' % p for p in pieces[1:])
def fetchres(detailurl, dirpath, times=0):
    try:
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        os.chdir(dirpath)
        curdir = os.getcwd()
        print 'Download for ' + curdir
        detailbook = urllib.urlopen(detailurl).read()
        htmlfile = open('%s.html' % parser2name(curdir), 'wb')
        htmlfile.write(detailbook)
        htmlfile.close()
        parser = MyHTMLParser()
        parser.feed(detailbook)
        jpgs = parser.get_jpgs()
        pngs = parser.get_pngs()
        gifs = parser.get_gifs()
        imgs = jpgs + pngs + gifs
        createtask(imgs)
    except socket.timeout:
        print "Fetch '%s' timeout." % detailurl
        if times < tryagaincount:
            print "The no.%d times to try." % (times + 2)
            fetchres(detailurl, dirpath, times + 1)
        else:
            print "Tried %d times, but still failed."
            print "####Please check network!####"
    except Exception, e:
        print(e)
    finally:
        pass
def jsonparse():
    """Parse the chapter-list JSON held in the module-global `jsonstr`
    and download the chapters it lists.

    Returns a falsy value when paging should stop: False once the
    `thenewest` newest chapters are handled, otherwise the number of
    chapters on this page (0 for an empty page ends the loop).
    """
    result = json.loads(jsonstr)["Return"]["List"]
    if thenewest > 0:
        # slicing (unlike the old xrange indexing) cannot raise
        # IndexError when thenewest exceeds the page size
        for item in result[:thenewest]:
            parserandsavehtml(item)
        return False
    for item in result:
        parserandsavehtml(item)
    return len(result)
def parserandsavehtml(item):
    """Build the detail-page url and target directory for one chapter
    item from the list API, then hand both to fetchres().
    """
    detailurl = '%s%s.html' % (hostname, details % item["Id"])
    dirname = '%d %s' % (item["ChapterNo"], item["Title"])
    # NOTE: os.path.join raises UnicodeEncodeError when `croot` contains
    # Chinese characters; details at http://www.cnblogs.com/abcat/p/3389531.html
    fetchres(detailurl, os.path.join(croot, dirname))
def calculatetime(used):
    if used <= 60:
        print 'Total used time is %ds.' % used
    elif used <= 3600:
        print 'Total used time is %dmins %ds.' % (used / 60, used % 60)
    else:
        print 'Total used time is %dhrs %dmins %ds.' %\
            (used / 3600, (used % 3600) / 60, (used % 3600) % 60)
# Entry point: page through the chapter-list API until jsonparse()
# reports nothing is left (falsy return), then print the elapsed time.
if __name__ == '__main__':
    os.chdir(croot)
    # one global socket timeout covers every urlopen below
    socket.setdefaulttimeout(maxtimeout)
    start = time.time()
    try:
        i = 0           # zero-based page index for the list API
        isbreak = True  # a falsy jsonparse() result ends the loop
        while isbreak:
            targetweb = hostname + chapters % (cartoonid, i)
            webfile = urllib.urlopen(targetweb)
            jsonstr = webfile.read()  # module global consumed by jsonparse()
            isbreak = jsonparse()
            i = i + 1
        end = time.time()
        calculatetime(int(end - start))
    except socket.timeout:
        print('timeout')
    except Exception as e:
        print('error', e)
    finally:
        pass
        


最后再po一张路飞仙人模式图O(∩_∩)O



声明:本脚本仅供学习交流使用,禁止用于商业用途,若带来商业纠纷与本人无关!