思路
- 递归获取 src 目录下所有的 .java 文件。
- 使用 GBK 读取文件内容,然后以 UTF-8 编码写回原文件。
- 配合git就知道哪些文件有修改过了。
脚本1
import os
import re
import sys
import chardet
def show_files(path):
    """Recursively gather the files under *path* that ``append_file`` accepts.

    Entries are visited in ``os.listdir`` order. Sub-directories are descended
    into immediately (depth-first); every other entry is handed to
    ``append_file``, which decides whether it ends up in the result.

    :param path: directory to scan.
    :return: list of everything ``append_file`` appended for the whole tree.
    """
    collected = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isdir(full_path):
            append_file(collected, full_path)
            continue
        # Directory: merge the results of the nested scan into ours.
        collected += show_files(full_path)
    return collected
def append_file(all_files, file_name):
    """Append *file_name* to *all_files* if it is a Java file needing conversion.

    Only names ending in ``.java`` are considered. For those,
    ``charset_detect`` is consulted and whatever truthy value it returns is
    appended to *all_files* (mutated in place).

    :param all_files: list that receives the accepted file names.
    :param file_name: path of the candidate file.
    :return: None.
    """
    # The previous pattern '.*.java' had an unescaped dot and no end anchor,
    # so it also matched names such as 'notes_java.txt' or 'app.javascript'.
    if not file_name.endswith('.java'):
        return
    rt = charset_detect(file_name)
    if rt:
        all_files.append(rt)
def charset_detect(file_name) -> object:
    """Return *file_name* if the file is a candidate for re-encoding, else None.

    The whole file is read as bytes and passed to ``chardet.detect``. Files
    reported as ASCII or UTF-8 are skipped (``None`` is returned); any other
    result keeps the file as a candidate.

    :param file_name: path of the file to inspect.
    :return: *file_name* when the file should be converted, otherwise ``None``.
    """
    # ``with`` closes the handle even if reading fails.
    with open(file_name, 'rb') as fop:
        data = fop.read()
    if not data:
        # Nothing to convert in an empty file.
        return None
    # chardet reports ``None`` as the encoding when it cannot decide; the
    # previous code called ``.lower()`` on it and crashed. An undetected
    # encoding is still "neither ascii nor utf-8", so the file stays a
    # candidate and the caller's strict decode decides what happens to it.
    charset = (chardet.detect(data).get('encoding') or '').lower()
    if charset in ('ascii', 'utf-8'):
        return None
    return file_name
def filter_files(lst):
    """Placeholder with no behaviour yet: ignores *lst* and returns ``None``."""
    return None
if __name__ == '__main__':
    # Usage: <script> <source_dir> <report_file>
    # Re-encodes every candidate file found by show_files() under <source_dir>
    # to UTF-8 in place, then writes a report of failures and successes.
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: {0} <source_dir> <report_file>'.format(sys.argv[0]))
    path = sys.argv[1]
    back_path = sys.argv[2]
    contents = show_files(path)
    error_list = []
    suc_list = []
    for content in contents:
        with open(content, 'rb+') as f:
            info = f.read()
            try:
                # Strict decode: a file that is not valid GB2312 is reported
                # rather than rewritten with damaged text.
                # NOTE(review): the accompanying notes say GBK; 'gb2312' is the
                # stricter subset. Switching to 'gbk' would accept more files
                # but also more mis-detected ones - confirm before changing.
                info = info.decode('gb2312').encode('utf-8')
            except UnicodeDecodeError:
                print(content)
                error_list.append(content)
            else:
                # Rewrite in place; truncate first so no old bytes survive
                # when the new content is shorter.
                f.seek(0)
                f.truncate()
                f.write(info)
                suc_list.append(content)
    # The report is opened in text mode, so plain '\n' is used as terminator:
    # writing os.linesep here would produce '\r\r\n' on Windows. UTF-8 is
    # requested explicitly so the report does not depend on the locale.
    with open(back_path, 'w', encoding='utf-8') as back:
        back.write('error_list: \n')
        for i, err_file in enumerate(error_list):
            # A space separates the sequence number from the path.
            back.write(' Seq: {0} {1}\n'.format(i, err_file))
        back.write('suc_list: \n')
        for i, suc_file in enumerate(suc_list):
            back.write(' Seq: {0} {1}\n'.format(i, suc_file))
脚本2
import os
import chardet
from chardet.universaldetector import UniversalDetector
def get_encoding(file):
    """
    Detect the character encoding of a text file.

    The file is fed to a ``UniversalDetector`` line by line and reading stops
    as soon as the detector reports that it is done.

    :param file: path of the file to inspect.
    :return: the detector's result dict, e.g.
        {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
    """
    detector = UniversalDetector()
    # ``with`` guarantees the handle is closed even if feeding raises, and
    # iterating the file object avoids loading every line up front when the
    # detector finishes early.
    with open(file, "rb") as txt:
        for line in txt:
            detector.feed(line)
            if detector.done:
                break
    # close() finalises the detection so that ``result`` is populated.
    detector.close()
    return detector.result
def override_content(filepath,encoding):
    """Rewrite *filepath* in place as UTF-8.

    The file is read as bytes, decoded with the codec named in
    ``encoding['encoding']`` and written back UTF-8 encoded.

    :param filepath: path of the file to rewrite.
    :param encoding: detection result dict; only its ``'encoding'`` key is
        read. When that value is empty/None the file is left untouched.
    :return: None.
    """
    codec = encoding['encoding']
    if not codec:
        # No usable codec name: decoding would raise, so skip the file.
        print('无法识别编码,跳过 ==> ', filepath)
        return
    print('修改的文件 ==> ', encoding ,' ==> ',filepath)
    with open(filepath, 'rb+') as fp:
        content = fp.read()
        # NOTE: 'ignore' silently drops bytes that cannot be decoded, so a
        # wrong detection loses data instead of failing.
        content = content.decode(codec, 'ignore').encode("utf8")
        fp.seek(0)
        # Truncate before writing: when the UTF-8 form is shorter than the
        # original bytes, the old tail would otherwise remain in the file.
        fp.truncate()
        fp.write(content)
def change(path):
    """Recursively convert the ``.java`` files under *path* to UTF-8.

    Each ``.java`` file has its encoding detected with ``get_encoding``.
    Files reported as UTF-8 or ASCII are left alone; files whose encoding
    could not be detected are reported but not rewritten; everything else is
    passed to ``override_content``.

    :param path: directory to process.
    :return: None.
    """
    for f in os.listdir(path):
        fp = os.path.join(path, f)
        if os.path.isdir(fp):
            change(fp)
            continue
        if not f.endswith('.java'):
            continue
        code = get_encoding(fp)
        name = code['encoding']
        # ASCII bytes are already valid UTF-8, so rewriting them is pointless;
        # compare case-insensitively so the check does not depend on how the
        # detector spells the name.
        if name is not None and name.lower() in ('ascii', 'utf-8'):
            continue
        print('文件编码:', code, fp)
        if name is None:
            # Detection failed: there is no codec to decode with, so leave the
            # file untouched instead of crashing in override_content.
            continue
        override_content(fp, code)
if __name__ == "__main__":
    # Root of the tree to scan, relative to the current working directory.
    source_root = './src'
    print('查找的文件夹是' + source_root)
    change(source_root)