Python文件处理:转码 移动 均分 excel行列处理

256 阅读5分钟

任务一

1.将文件解析为16进制格式

2.将解析文件移入自定义文件夹

3.对解析文件自定义均分

from ntpath import join
import os
import re
import shutil
from unicodedata import name
import pandas as pd
import sys
from tqdm import tqdm

# name = "F:\\01data\\2021-04-18-13-46-58.TXT"
# o_dir = "F:\\01data\\测试数据\\3\\LOG\\"
'''
将原始.TXT文件解析为16进制
IN: 输入文件地址,输出文件地址
F: ord(c)转换为ASCII码
'''
def convert2hex(inputfile,outputfile):
    f = open(inputfile, "rb")
    outfile = open(outputfile, "w")
    i = 0
    while 1:
        c = f.read(1)
        i = i + 1
        if not c:
            break
        # if i%32 == 0:
        #     outfile.write("\n") #换行
        else:
            if ord(c) <= 15:
                outfile.write(("0x0"+hex(ord(c))[2:])[2:]+" ")
            else:
                outfile.write((hex(ord(c)))[2:]+" ")
    outfile.close()
    f.close()

'''
将.log文件移入自定义文件夹(LOG)
'''
def movelog(absolutepath):
    list = os.listdir(absolutepath)#文件夹下的内容
    # list.sort(key=lambda x: int(x.split("-")[2]))#排序
    # print(list)
    for f in tqdm(list):#遍历
        # print(f)
        if f.find(".log") != -1:
            dst_dir = absolutepath+ "LOG" +"\\"
            oldname = absolutepath + f #文件所在地址
            newname = dst_dir + f #目标地址
            if os.path.exists(dst_dir):
                    os.rename(oldname, newname)#直接移出
                    # shutil.copy2(oldname,newname)#复制移出
                    # print("%s====>>>%s \n" % (oldname, newname))
            else:
                    os.mkdir(dst_dir)
                    os.rename(oldname, newname)#直接移出
                    # shutil.copy2(oldname,newname)#复制移出
                    # print("%s====>>>%s \n" % (oldname, newname))
        else:
            pass
'''
将log按照每num个分成一份
IN: 文件路径,num
e.g:分割后的文件树
LOG(一级文件夹)
    |20210418(15-18)(二级文件夹)
        |1.log
        |2.log
        ...
        |num.log
    |20210418(19-22)
        |1.log
        |2.log
        ...
        |num.log
    |20210418(23-02)
        |1.log
        |2.log
        ...
        |num.log

'''
def splitlog(absolutepath,num):
    file_path = absolutepath
    new_file_path = absolutepath
    list = os.listdir(absolutepath)#文件夹下的内容
    #######################若读入顺序混乱需要先排序############################
    # list.sort(key=lambda x: (int(x.split("-")[2]),int(x.split("-")[3])))#多条件排序
    cnt = 0
    num_file = int(len(list)/num)
    for n in tqdm(range(1, num_file + 1)):  # 创建文件夹
        # new_file = os.path.join(new_file_path + str(n)) #简单命名
        #################复杂命名###################
        headname = list[num*cnt].split(".log")[0]
        footname = list[num*(cnt+1)-1].split(".log")[0]
        nametime = headname.split("-")[:3]
        nametime = ''.join(nametime)
        new_file = os.path.join(new_file_path + nametime +
                                '('+str(headname.split("-")[3])+"-"+
                                str(footname.split("-")[3])+')')
        # if os.path.exists(new_file + str(cnt)):
        #     print('该路径已存在,请解决冲突', new_file)
        #     exit()
        # print('创建文件夹:', new_file)
        os.mkdir(new_file)
        list_n = list[num * cnt:num * (cnt + 1)]
        for m in list_n:
            old_path = os.path.join(file_path, m)
            new_path = os.path.join(new_file, m)
            os.rename(old_path, new_path)#剪切移出
            # shutil.copy(old_path, new_path) #复制移出
        cnt += 1


'''
批量处理将.TXT转为.log
'''
def muliconverthex(o_dir):
    dirs = os.listdir(o_dir) #文件夹
    for dir in tqdm(dirs):
        if dir.find(".TXT") != -1:
            name = o_dir+"\\"+dir.split(".TXT")[0]
            convert2hex(name+".TXT",name+".log")
        else:
            pass


if __name__ == '__main__':
    # o_dir = "F:\\01data\\测试数据\\tst\\"
    #路径后要加“\\”或“/”
    o_dir = sys.argv[1]
    muliconverthex(o_dir)
    movelog(o_dir)
    splitlog(o_dir+"\\LOG\\",4)

任务二

1.解析文件内容自定义分割

2.分割后文件处理存为xlsx

3.xlsx文件合并为csv

import os
import re
import shutil
import pandas as pd
import sys
from tqdm import tqdm

'''
将16进制LOG文件转换为规定格式的TXT文件
IN: 
    search_text需要替换的文本
    replace_text替换后的文本
    dirfiles包含log文件的文件夹
    absolutepath文件夹的绝对路径
F: 
    data.replace(search_text, replace_text)
e.g:
    search_text = "85 00 00"
    replace_text = "\n85 00 00"
'''
def LOG2TXT(search_text,replace_text,dirfiles,absolutepath):
    for dir in tqdm(dirfiles):
        if dir.find(".log") != -1:
            name = absolutepath+"\\"+dir.split(".log")[0]
        #   print(name)
            #读取log文件并替换
            with open(name + ".log", 'r',encoding='UTF-8') as file:
                data = file.read()
                data = data.replace(search_text, replace_text)
                file.close()

            #存为txt文件
            with open(name + ".txt", 'w',encoding='UTF-8') as fl:
                fl.write(data)
                fl.close()
        else:
            pass
'''
将txt文件转换为xlsx
IN:dirfiles包含txt文件的文件夹
    absolutepath文件夹的绝对路径
F:
    1.使用.read_csv()读取txt文件,跳过第一行和最后一行,保留3,4,5列,并命名为'KEY','RAW_H','RAW_L'
    2.将'RAW_H','RAW_L'的10进制计算结果放在'RAW_HD','RAW_LD'列;将'RAW_HD','RAW_LD'的和放入RAW列
    3.将上述文件存入xlsx的sheet1中
    4.将'KEY'=i 对应的'RAW'值写入sheet2的第i列,并命名为RAWi;i为数字0到6
    5.保存xlsx
'''

def TXT2XLSX(dirfiles,absolutepath):
    for dir in tqdm(dirfiles):
        if dir.find(".txt") != -1:
            name = absolutepath+"\\"+dir.split(".txt")[0]
            #pandas读取txt文件
            data = pd.read_csv(name + ".txt", 
                                sep="\s+",
                                usecols=[3,4,5],
                                skiprows=1,
                                skipfooter=1, 
                                header=None,
                                names=['KEY','RAW_H','RAW_L'],
                                # error_bad_lines=False,
                                engine='python')

            #指定列16进制转10进制
            data['RAW_HD'] = data['RAW_H'].str[:].apply(lambda x: int(x,16)*256)
            data['RAW_LD'] = data['RAW_L'].str[:].apply(lambda x: int(x,16))
            data.eval("RAW = RAW_HD + RAW_LD ",inplace = True)

            # print(data)

            # 创建Pandas写入Excel对象
            writer = pd.ExcelWriter(name+'.xlsx')

            # 将data写入Sheet1中
            data.to_excel(writer, sheet_name='Sheet1', 
                        startrow=0, startcol=0, header=True, index=False)

            #提取匹配列到sheet2
            for i in range(0,7):
                DFK = data.loc[data['KEY'] == i, ['RAW']]
                DFK.rename(columns = {'RAW':'RAW'+str(i)}, inplace = True)
                DFK.to_excel(writer, sheet_name='Sheet2', 
                        startrow=0, startcol=i, header=True, index=False)

            #写入xlsx文件
            writer.save()
        else:
            pass
'''
将所有xlsx中sheet2的值合并到csv中
IN:
    i 文件夹名
    absolutepath 文件夹所在的绝对地址
    dirfiles 文件名
F:  使用.to_csv()写入,写入模式为"a"不覆盖写入
'''
def APPENDXLSX(i,dirfiles,absolutepath):
    for dir in tqdm(dirfiles):
        if dir.find(".xlsx") != -1:
            name = absolutepath+"\\"+dir.split(".xlsx")[0]
            print(name)

            #pandas读取xlsx文件
            data = pd.read_excel(name + ".xlsx", sheet_name="Sheet2")
            # df = data.applymap(lambda x: str(x).strip()) #去除空格
            
            #pandas不覆盖写入csv文件
            data.to_csv(absolutepath+"\\"+str(i)+"_"+'ALL.csv', mode='a', index=False, header=0)
        else:
            pass

'''
文件夹循环
IN: o_dir 一级文件夹
    search_text,replace_text同函数LOG2TXT()
e.g:
DIR (一级文件夹)
    |DIR1(二级文件夹)
        |FILE11(文件)
        |FILE12
        ...
    |DIR2(二级文件夹)
        |FILE21
        |FILE22
        ...
    |DIR3(二级文件夹)
        |FILE31
        |FILE32
        ...
'''

def loopdirs(o_dir,search_text,replace_text):
    dirs = os.listdir(o_dir) #文件夹
    for i in tqdm(dirs):  # 遍历所有当前路径下的文件夹
        absolutepath = o_dir + i+"\\"  #得到文件夹所在的绝对路径
        dirfiles = os.listdir(absolutepath)#文件夹下的内容
        # print(absolutepath)
        LOG2TXT(search_text,replace_text,dirfiles,absolutepath)
    for i in tqdm(dirs):  
        absolutepath = o_dir + i+"\\"  
        dirfiles = os.listdir(absolutepath)
        # print(absolutepath)
        TXT2XLSX(dirfiles,absolutepath)
    for i in tqdm(dirs):  
        absolutepath = o_dir + i+"\\"  
        dirfiles = os.listdir(absolutepath)
        #######################若读入顺序混乱需要先排序############################
        # dirfiles.sort(key=lambda x: (int(x.split("-")[2]),int(x.split("-")[3])))#多条件排序
        # print(absolutepath)    
        APPENDXLSX(i,dirfiles,absolutepath)




if __name__ == '__main__':
    # o_dir = "F:\\01data\\测试数据\\tst\\LOG\\"
    #路径后要加“\\”或“/”
    o_dir = sys.argv[1]
    
    search_text = "85 00 00"
    replace_text = "\n85 00 00"
    loopdirs(o_dir,search_text,replace_text)
    # LOG2TXT(search_text,replace_text,o_dir)
    # TXT2XLSX(o_dir)
    # APPENDXLSX(o_dir)

其他应用

正则表达式替换字符串

'''
处理文件结构
DIR (一级文件夹) 
    |DIR1(二级文件夹) 
        |FILE11(文件) 
        |FILE12
运行格式: python XXX.py "F:\\01data\\方位数据\\"
'''

import os
import re
import shutil
from pandas import DataFrame
import pandas as pd
import sys
from tqdm import tqdm


text1 = re.compile("\[2022-05-25 \d{2}:\d{2}:\d{2}.\d{3}\]# RECV HEX>")# [2022-05-25 15:26:28.528]# RECV HEX>

o_dir = sys.argv[1]

dirs = os.listdir(o_dir) #文件夹
for i in tqdm(dirs):  # 遍历所有当前路径下的文件夹
    # print(i)  
    absolutepath = o_dir + i+"\\"  #得到文件夹所在的绝对路径
    dirfiles = os.listdir(absolutepath)#文件夹下的内容

    # dirs.sort(key=lambda x: int(x.split("-")[2]))#排序
    for dir in tqdm(dirfiles):
        if dir.find(".log") != -1:
            name = absolutepath+"\\"+dir.split(".log")[0]
            #读取log文件并替换
            with open(name + ".log", 'r',encoding='UTF-8') as file:
                data = file.read()
                # data = data.replace(search_text, replace_text)
                data = text1.sub('\n',data) #将data中的字符串按照text1的正则表达式替换为\n
                file.close()

            #存为txt文件
            with open(name + ".txt", 'w',encoding='UTF-8') as fl:
                fl.write(data)
                fl.close()

        else:
            pass