Because we need to extract a book's content in order to analyze the user's unknown words, we must have a way to parse EPUB books.
Using ebooklib to split an EPUB file
def test_open_epub(self):
    """Dump every item of the test EPUB into a flat ``pdf_chapters`` folder.

    Each item is written under its base file name only, so the folder
    layout used inside the archive is discarded.
    """
    import os

    book_path = Path(__file__).resolve().parent.joinpath(test_filename)
    book = epub.read_epub(book_path)
    output_dir = "pdf_chapters"
    # open() does not create missing parent folders, so make sure the
    # target folder exists before writing the first item.
    os.makedirs(output_dir, exist_ok=True)
    for chapter in book.get_items():
        chapter_file = os.path.basename(chapter.file_name)
        output_file = os.path.join(output_dir, chapter_file)
        with open(output_file, 'wb') as f:
            f.write(chapter.get_content())
After this we get a lot of image and HTML files,
but those HTML files reference their images with relative paths.
We could rewrite those references to src="./00049.jpeg", but I think a better way is to save the HTML files in a root/html/ folder and the image files in a root/images/ folder.
def test_open_epub(self):
    """Split the test EPUB into per-type sub-folders of ``pdf_chapters``.

    Images, CSS, JavaScript, fonts and HTML documents each go to their own
    sub-folder; items of any other type are written directly into
    ``pdf_chapters``. Files are stored under their base name only.
    """
    import os

    book_path = Path(__file__).resolve().parent.joinpath(test_filename)
    book = epub.read_epub(book_path)
    output_dir = "pdf_chapters"
    # Numeric item type (as returned by item.get_type()) -> sub-folder name.
    subfolder_by_type = {
        1: 'images',  # image
        2: 'css',     # css
        3: 'js',      # javascript
        6: 'fonts',   # font
        9: 'html',    # document
    }
    for item in book.get_items():
        subfolder = subfolder_by_type.get(item.get_type())
        if subfolder:
            output_folder = os.path.join(output_dir, subfolder)
        else:
            # Unmapped item types stay in the root output folder.
            output_folder = output_dir
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_folder, exist_ok=True)
        chapter_file = os.path.basename(item.file_name)
        with open(os.path.join(output_folder, chapter_file), 'wb') as f:
            f.write(item.get_content())
Here we use filename as a Django FileField so that the uploaded file is saved automatically under MEDIA_ROOT, and then we do the split job. But it raises an error. Maybe at that point the file has not been completely saved to disk yet. So we move this logic to:
Because this is our first time using a signal sender, we also wrote another useful function that deletes the file after the database record is deleted. And it works!
So we can give every book a separate folder named after its book id (because the title may change):
@receiver(post_save, sender=Book)
def split_epub(sender, instance, created, **kwargs):
    """Split a newly created book's EPUB into a folder named after its id.

    Connected to ``post_save`` for ``Book``; saves of already existing
    records (``created`` is false) are ignored.
    """
    if not created:
        return
    # do epub split for each book
    epub_path = settings.MEDIA_ROOT.joinpath(instance.filename.path)
    chapters_dir = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath(str(instance.id))
    do_split(book_path=epub_path, output_dir=chapters_dir)
Frontend reads the split files
Now we have the EPUB's chapter files, but which one comes FIRST, and how do we know their order?
Thankfully, ebooklib supports the table of contents (TOC). We can read it and parse it into a recursive list:
But the href is text/xxx.html while our file name is xxx.html, so we remove this prefix in the for loop. Some hrefs also carry a hash fragment (#...), so we need to ignore that part.
Test case
class SplitEpubTestCase(TestCase):
    """Check that an EPUB is split to disk and every TOC entry resolves."""

    # Folder the split output is written to; removed again in tearDownClass.
    output_dir = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath('for_test')
    # EPUB fixture stored next to this test module.
    book_path = Path(__file__).resolve().parent.joinpath(test_filename)

    @classmethod
    def tearDownClass(cls):
        """Remove the split output, then run the base-class teardown."""
        try:
            if check_folder_exists(cls.output_dir):
                shutil.rmtree(cls.output_dir)
        finally:
            # The base TestCase may do its own class-level cleanup (Django's
            # does), so it must run even if removing the folder fails.
            super().tearDownClass()

    def test_split_epub_as_chapters_and_have_toc(self):
        """Split the book and verify each TOC href exists under ``html``."""
        # check do split job
        do_split(book_path=self.book_path, output_dir=self.output_dir)
        self.assertTrue(check_folder_exists(self.output_dir))

        # check toc
        toc = get_epub_toc(self.book_path)
        self.assertGreater(len(toc), 0)
        output_html_dir = self.output_dir.joinpath('html')

        def recursive_check(toc_list):
            """Assert href/title/file for each entry, descending into children."""
            for item in toc_list:
                # some href have hash; a missing href becomes '' so it fails
                # the assertion below instead of raising AttributeError
                href = (item.get('href') or '').split('#')[0]
                self.assertTrue(href, msg=f'toc entry without href: {item!r}')
                self.assertTrue(
                    item.get('title'),
                    msg=f'toc entry without title: {href}',
                )
                self.assertTrue(
                    output_html_dir.joinpath(href).exists(),
                    msg=f'not exists: {href}',
                )
                children = item.get('children')
                if children:
                    recursive_check(children)

        recursive_check(toc)
What bad luck
Some books lay out their image and HTML folders differently, like this:
😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂
We overthought it. We should keep each file's original folder and relative layout and simply save the files as they are, so every image reference will resolve correctly.
Delete useless code:
Last Test Case
We use two EPUB books with different formats to test:
# EPUB fixtures used by the split tests; the two books use different formats.
test_filename = 'The_Sense_of_Style.epub'
test_filename2 = 'alice.epub'
class SplitEpubTestCase(TestCase):
    """Check EPUB splitting and TOC resolution against two different books."""

    # Output folder and fixture path for the first book.
    output_dir = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath('for_test')
    book_path = Path(__file__).resolve().parent.joinpath(test_filename)

    # Output folder and fixture path for the second book.
    output_dir2 = settings.EPUB_CHAPTERS_MEDIA_ROOT.joinpath('for_test2')
    book_path2 = Path(__file__).resolve().parent.joinpath(test_filename2)

    @classmethod
    def tearDownClass(cls):
        """Remove both split outputs, then run the base-class teardown."""
        import shutil

        try:
            # Stale output from an earlier run would let the folder-exists
            # assertion pass even if the split wrote nothing, so clean up
            # every folder this class writes to.
            for folder in (cls.output_dir, cls.output_dir2):
                if check_folder_exists(folder):
                    shutil.rmtree(folder)
        finally:
            # The base TestCase may do its own class-level cleanup (Django's
            # does), so it must run even if removing a folder fails.
            super().tearDownClass()

    def test_split_epub_as_chapters_and_have_toc(self):
        """Run the split/TOC check for each book as its own sub-test."""
        cases = (
            (self.book_path, self.output_dir),
            (self.book_path2, self.output_dir2),
        )
        for book_path, output_dir in cases:
            # subTest reports each book separately and keeps checking the
            # second book when the first one fails.
            with self.subTest(book=book_path.name):
                self.do_test(book_path, output_dir)

    def do_test(self, book_path, output_dir):
        """Split ``book_path`` into ``output_dir`` and verify every TOC entry.

        TOC hrefs are resolved directly against ``output_dir``.
        """
        # check do split job
        do_split(book_path=book_path, output_dir=output_dir)
        self.assertTrue(check_folder_exists(output_dir))

        # check toc
        toc = get_epub_toc(book_path)
        self.assertGreater(len(toc), 0)

        def recursive_check(toc_list):
            """Assert href/title/file for each entry, descending into children."""
            for item in toc_list:
                # some href have hash; a missing href becomes '' so it fails
                # the assertion below instead of raising AttributeError
                href = (item.get('href') or '').split('#')[0]
                self.assertTrue(href, msg=f'toc entry without href: {item!r}')
                self.assertTrue(
                    item.get('title'),
                    msg=f'toc entry without title: {href}',
                )
                self.assertTrue(
                    output_dir.joinpath(href).exists(),
                    msg=f'not exists: {href}',
                )
                children = item.get('children')
                if children:
                    recursive_check(children)

        recursive_check(toc)
OK. It works.
Thanks for reading!