利用Python把java文件GBK编码转成UTF-8

334 阅读1分钟

思路

  1. 递归获取src目录下所有的文件。
  2. 使用GBK读取文件内容,然后使用utf-8编码,写入文件。
  3. 配合git就知道哪些文件有修改过了。

脚本1

#!/usr/bin/python3
# -*- coding:UTF-8 -*-
# 把文件编码转为UTF-8编码
# 安装:pip3 install chardet
# 说明: 配合git使用,把目录下的java文件转成utf-8编码
# 使用: python3 demo.py 转换目录 a.log
# cmd参数第一个传目录,第二个传生成日志的信息名 
import os
import re
import sys

import chardet

def show_files(path):
    """Recursively collect the files under ``path`` that need converting.

    Sub-directories are walked depth-first. Every non-directory entry is
    handed to ``append_file``, which decides whether it is added to the
    result list.

    :param path: directory to scan
    :return: list of file paths accepted by ``append_file``
    """
    collected = []
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if not os.path.isdir(entry_path):
            append_file(collected, entry_path)
            continue
        collected += show_files(entry_path)
    return collected


def append_file(all_files, file_name):
    """Add ``file_name`` to ``all_files`` if it is a Java source needing conversion.

    Only names ending in ``.java`` are inspected; ``charset_detect`` then
    decides whether the file still has to be converted.

    :param all_files: list that is extended in place
    :param file_name: path of the candidate file
    """
    # A suffix test replaces the former regex '.*.java', whose unescaped dot
    # and missing end anchor also matched names such as 'x.javascript',
    # 'myjava.txt' or 'Main.java.bak'.
    if not file_name.endswith('.java'):
        return
    rt = charset_detect(file_name)
    if rt:
        all_files.append(rt)


def charset_detect(file_name) -> object:
    """Return ``file_name`` if the file still needs converting, else ``None``.

    The raw bytes of the file are passed to ``chardet.detect``. Files
    detected as ASCII or UTF-8 need no conversion, and files for which no
    encoding is reported are skipped as well.

    :param file_name: path of the file to inspect
    :return: ``file_name`` when conversion is required, otherwise ``None``
    """
    # The context manager closes the handle even if reading fails.
    with open(file_name, 'rb') as fop:
        raw = fop.read()

    # chardet reports the encoding as None when it cannot make a guess
    # (for example on an empty file); calling .lower() on it used to crash.
    charset = chardet.detect(raw).get('encoding')
    if charset is None:
        return None
    if charset.lower() in ('ascii', 'utf-8'):
        return None
    return file_name


def filter_files(lst):
    """Placeholder: does nothing with ``lst`` and returns ``None``."""
    return None


def convert_file_to_utf8(file_path, source_encoding='gbk'):
    """Re-encode one file in place from ``source_encoding`` to UTF-8.

    The default is GBK rather than GB2312: GBK is a superset, so every file
    that decoded before still decodes identically, while files containing
    GBK-only characters no longer fail.

    :param file_path: path of the file to rewrite
    :param source_encoding: codec used to decode the current content
    :return: ``True`` if the file was rewritten, ``False`` if its bytes are
        not valid in ``source_encoding`` (the file is then left untouched)
    """
    with open(file_path, 'rb+') as f:
        raw = f.read()
        try:
            converted = raw.decode(source_encoding).encode('utf-8')
        except UnicodeDecodeError:
            return False
        # Rewind and truncate first so no stale bytes survive the rewrite.
        f.seek(0)
        f.truncate()
        f.write(converted)
    return True


def write_report(report_path, error_list, suc_list):
    """Write the list of failed and converted files to ``report_path``.

    :param report_path: path of the log file to create or overwrite
    :param error_list: paths that could not be decoded
    :param suc_list: paths that were converted
    """
    lines = ['error_list: ']
    lines.extend('    Seq: {0} {1}'.format(i, name)
                 for i, name in enumerate(error_list))
    lines.append('suc_list: ')
    lines.extend('    Seq: {0} {1}'.format(i, name)
                 for i, name in enumerate(suc_list))
    # Text mode already translates '\n' to the platform line ending; writing
    # os.linesep here would produce '\r\r\n' on Windows.
    with open(report_path, 'w', encoding='utf-8') as back:
        back.write('\n'.join(lines) + '\n')


if __name__ == '__main__':
    # argv[1]: directory to convert, argv[2]: path of the log file to write.
    if len(sys.argv) < 3:
        sys.exit('usage: python3 {0} <source_dir> <log_file>'.format(sys.argv[0]))
    path = sys.argv[1]
    back_path = sys.argv[2]

    error_list = []
    suc_list = []
    for content in show_files(path):
        if convert_file_to_utf8(content):
            suc_list.append(content)
        else:
            # Echo failures immediately; they are also listed in the log.
            print(content)
            error_list.append(content)

    write_report(back_path, error_list, suc_list)

脚本2

# coding=utf-8
# pip install chardet
# 使用配合git,新建一条分支。
# 在根目录下运行 python3 gbk2utf8.py 或者 python2 gbk2utf8.py 即可 (优先使用py3)
# 重新运行项目,确保代码可运行再提交
import os
import chardet
from chardet.universaldetector import UniversalDetector
def get_encoding(file):
    """
    Detect the encoding of a text file.

    The file is fed to a ``UniversalDetector`` line by line, stopping early
    once the detector reports that it is done.

    :param file: path of the file to inspect
    :return: the detector's result dict, e.g.
        {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
    """
    detector = UniversalDetector()
    # The context manager closes the file even if feeding raises, and
    # iterating the handle avoids loading every line into memory first.
    with open(file, "rb") as txt:
        for line in txt:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result


def override_content(filepath, encoding):
    """Rewrite ``filepath`` in place as UTF-8.

    :param filepath: path of the file to convert
    :param encoding: detection result dict; its ``'encoding'`` entry names
        the codec used to decode the current content
    """
    source_encoding = encoding['encoding']
    if source_encoding is None:
        # No codec to decode with (detection failed); leave the file alone
        # instead of crashing inside bytes.decode().
        print('skip (unknown encoding) ==> ', filepath)
        return

    # GB2312 < GBK < GB18030: decode with the widest member of the family so
    # characters outside the reported subset are not dropped by 'ignore'.
    if source_encoding.lower() in ('gb2312', 'gbk'):
        source_encoding = 'gb18030'

    print('修改的文件 ==> ', encoding ,' ==> ',filepath)
    with open(filepath, 'rb+') as fp:
        content = fp.read()
        # NOTE: 'ignore' silently discards bytes that cannot be decoded.
        content = content.decode(source_encoding, 'ignore').encode("utf8")
        fp.seek(0)
        # Truncate before writing: the UTF-8 form can be shorter than the
        # original (e.g. UTF-16 or BOM-prefixed input), which used to leave
        # stale bytes at the end of the file.
        fp.truncate()
        fp.write(content)
def change(path):
    """Recursively convert every ``.java`` file under ``path`` to UTF-8.

    Each ``.java`` file has its encoding detected with ``get_encoding``;
    files that are not already UTF-8 compatible are rewritten through
    ``override_content``.

    :param path: directory to process
    """
    for f in os.listdir(path):
        fp = os.path.join(path, f)
        if os.path.isdir(fp):
            change(fp)
            continue
        if not f.endswith('.java'):
            continue

        code = get_encoding(fp)
        detected = code['encoding']
        # Skip when there is nothing to convert: no encoding was detected
        # (e.g. an empty file), or the content is already UTF-8 / plain
        # ASCII, whose bytes are identical in UTF-8.
        if detected is None or detected.lower() in ('utf-8', 'ascii'):
            continue

        print('文件编码:', code, fp)
        override_content(fp, code)


if __name__ == "__main__":
    # Folder whose .java files are converted, relative to the working directory.
    source_root = './src'
    print('查找的文件夹是' + source_root)
    change(source_root)