How to implement article word extraction and analysis? (02)

89 阅读2分钟

Because we need to extract a book's content in order to analyze the words a user does not know, we must have a way to parse EPUB books.

Using ebooklib to split an EPUB file

def test_open_epub(self):
    """Dump every item of the test EPUB into the ``pdf_chapters`` folder.

    Each item returned by ``book.get_items()`` is written as raw bytes
    under its base file name, so any folder structure inside the EPUB is
    flattened into a single output folder.
    """
    import os

    book_path = Path(__file__).resolve().parent.joinpath(test_filename)
    book = epub.read_epub(book_path)
    output_dir = "pdf_chapters"
    # open(..., 'wb') does not create missing parent folders, so make sure
    # the output folder exists before the first write.
    os.makedirs(output_dir, exist_ok=True)
    for chapter in book.get_items():
        chapter_file = os.path.basename(chapter.file_name)
        output_file = os.path.join(output_dir, chapter_file)

        with open(output_file, 'wb') as f:
            f.write(chapter.get_content())
    

After this we get a lot of image and HTML files:

image.png

But those HTML files reference their images with relative paths.

image.png

We could change those references to src="./00049.jpeg", but I think a better way is to save the HTML files in a root/html/ folder and the image files in a root/images/ folder.


def test_open_epub(self):
    """Split the test EPUB into per-type sub-folders of ``pdf_chapters``.

    Every item returned by ``book.get_items()`` is written as raw bytes
    under its base file name. The destination sub-folder is chosen from
    the value of ``item.get_type()``; items of any other type are written
    directly into ``pdf_chapters``.
    """
    import os

    # Sub-folder for each handled ``item.get_type()`` value.
    # NOTE(review): the numeric codes look like ebooklib's ITEM_* constants
    # (ITEM_IMAGE, ITEM_STYLE, ...) -- confirm and switch to the named
    # constants once ``ebooklib`` itself is imported in this module.
    type_folders = {
        1: 'images',  # image
        2: 'css',     # css
        3: 'js',      # javascript
        6: 'fonts',   # font
        9: 'html',    # document
    }

    book_path = Path(__file__).resolve().parent.joinpath(test_filename)
    book = epub.read_epub(book_path)
    output_dir = "pdf_chapters"
    for item in book.get_items():
        chapter_file = os.path.basename(item.file_name)
        subfolder = type_folders.get(item.get_type())
        if subfolder is None:
            # Unhandled item types stay in the root of the output folder.
            output_folder = output_dir
        else:
            output_folder = os.path.join(output_dir, subfolder)
        # exist_ok=True creates the folder only when it is missing and
        # avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(output_folder, exist_ok=True)

        with open(os.path.join(output_folder, chapter_file), 'wb') as f:
            f.write(item.get_content())

image.png

image.png

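Here we use `filename`, a Django FileField, to automatically save the uploaded file under MEDIA_ROOT. Then we do the split job. But it raised an error: maybe at that moment the file was not yet completely saved to disk. So we moved this logic into a `post_save` signal receiver (the `split_epub` function shown further below).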

Because this was our first time using signals, we also wrote another useful receiver that deletes the file after the database record is deleted. And it works!

image.png

So we can give every book its own folder, named after the book id (because the title may change):

@receiver(post_save, sender=Book)
def split_epub(sender, instance, created, **kwargs):
    """Split a newly created book's EPUB into a folder named by its id.

    Runs on every ``post_save`` of ``Book`` but does nothing unless the
    record was just created.
    """
    if not created:
        return

    # One output folder per book, keyed by the record id.
    epub_path = settings.MEDIA_ROOT.joinpath(instance.filename.path)
    chapters_dir = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath(str(instance.id))
    do_split(book_path=epub_path, output_dir=chapters_dir)

Frontend reads the split files

Now we have the EPUB's chapter files, but which one is FIRST, and how can we determine their order?

Thankfully, ebooklib supports the table of contents (TOC). We can read it and parse it into a recursive (nested) list:

image.png

But each href looks like text/xxx.html while our file name is xxx.html, so we remove this prefix in the for loop. Also, some hrefs contain a hash fragment (#...), so we need to ignore that part.


Test case


class SplitEpubTestCase(TestCase):
    """Check that ``do_split`` writes every chapter referenced by the TOC."""

    # Folder the split output is written to; removed again in tearDownClass.
    output_dir = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath('for_test')
    # Fixture book, looked up in the directory of this test module.
    book_path = Path(__file__).resolve().parent.joinpath(test_filename)

    @classmethod
    def tearDownClass(cls):
        """Remove the split output, then run the base class teardown."""
        try:
            if check_folder_exists(cls.output_dir):
                shutil.rmtree(cls.output_dir)
        finally:
            # An overriding tearDownClass must still chain to the parent so
            # the base class can run its own class-level cleanup.
            super().tearDownClass()

    def test_split_epub_as_chapters_and_have_toc(self):
        """Split the book and verify each TOC entry has an HTML file."""
        # check do split job
        do_split(book_path=self.book_path, output_dir=self.output_dir)
        self.assertTrue(check_folder_exists(self.output_dir))

        # check toc
        toc = get_epub_toc(self.book_path)
        self.assertGreater(len(toc), 0)
        output_html_dir = self.output_dir.joinpath('html')

        def recursive_check(toc_list):
            """Assert every entry, including nested children, is valid."""
            for item in toc_list:
                # Some hrefs carry a "#fragment"; only the file part names
                # a file on disk. A missing href becomes '' so that it fails
                # the assertion below instead of raising AttributeError.
                href = (item.get('href') or '').split('#')[0]
                self.assertTrue(href)
                self.assertTrue(item.get('title'))
                self.assertTrue(
                    output_html_dir.joinpath(href).exists(),
                    f'not exists: {href}',
                )
                children = item.get('children')
                if children:
                    recursive_check(children)

        recursive_check(toc)

What bad luck

Some books have a different relationship between their image and HTML folders, like this:

image.png

image.png

😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂

We overthought it. We should keep each file's original folder and relative location and simply save the files as they are, so every image reference keeps working.

Delete useless code:

image.png

image.png

Last Test Case

We use two EPUB books with different internal layouts for the test:

# File names of the two EPUB fixtures used by SplitEpubTestCase; the test
# class joins each name onto the directory of this module to locate the book.
test_filename = 'The_Sense_of_Style.epub'
test_filename2 = 'alice.epub'

class SplitEpubTestCase(TestCase):
    """Check ``do_split`` and ``get_epub_toc`` against two fixture books."""

    # Output folder and fixture path for the first book.
    output_dir = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath('for_test')
    book_path = Path(__file__).resolve().parent.joinpath(test_filename)

    # Output folder and fixture path for the second book.
    output_dir2 = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath('for_test2')
    book_path2 = Path(__file__).resolve().parent.joinpath(test_filename2)

    @classmethod
    def tearDownClass(cls):
        """Remove both split output folders, then run the base teardown."""
        import shutil

        try:
            # Clean up every folder the test writes so that repeated runs
            # do not leave artifacts behind.
            for folder in (cls.output_dir, cls.output_dir2):
                if check_folder_exists(folder):
                    shutil.rmtree(folder)
        finally:
            # An overriding tearDownClass must still chain to the parent so
            # the base class can run its own class-level cleanup.
            super().tearDownClass()

    def test_split_epub_as_chapters_and_have_toc(self):
        """Run the split/TOC check once per fixture book."""
        cases = (
            (self.book_path, self.output_dir),
            (self.book_path2, self.output_dir2),
        )
        for book_path, output_dir in cases:
            # subTest keeps a failure in one book from hiding the other.
            with self.subTest(book=str(book_path)):
                self.do_test(book_path, output_dir)

    def do_test(self, book_path, output_dir):
        """Split ``book_path`` into ``output_dir`` and verify its TOC.

        Every TOC entry, including nested children, must have a title and
        an href whose file part exists relative to ``output_dir``.
        """
        # check do split job
        do_split(book_path=book_path, output_dir=output_dir)
        self.assertTrue(check_folder_exists(output_dir))

        # check toc
        toc = get_epub_toc(book_path)
        self.assertGreater(len(toc), 0)

        def recursive_check(toc_list):
            """Assert every entry, including nested children, is valid."""
            for item in toc_list:
                # Some hrefs carry a "#fragment"; only the file part names
                # a file on disk. A missing href becomes '' so that it fails
                # the assertion below instead of raising AttributeError.
                href = (item.get('href') or '').split('#')[0]
                self.assertTrue(href)
                self.assertTrue(item.get('title'))
                self.assertTrue(
                    output_dir.joinpath(href).exists(),
                    f'not exists: {href}',
                )
                children = item.get('children')
                if children:
                    recursive_check(children)

        recursive_check(toc)

OK. It works.

image.png

image.png


Thanks for reading!