Python词频统计

371 阅读1分钟

过程比较坎坷,网上的资料都无法完全满足自己的需求。

# -*- coding:utf-8 -*-  
import time
from collections import Counter
import re
import jieba  
import jieba.analyse  

# Count how often each multi-character Chinese word occurs in abc.txt and
# print the ten most frequent words with their counts.
#
# Single-argument print(...) is used throughout: on Python 2 it is a print
# statement with a parenthesized expression (identical output), and it also
# parses as a function call on Python 3.

# Read the whole text up front; the with-block closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('abc.txt') as f:
    txt = f.read()

result = Counter()
# cut_all=False asks jieba for its accurate (non-"full") segmentation mode.
seg_list = jieba.cut(txt, cut_all=False)
# One or more characters in the range U+4E00..U+9FA5.
re_gex = u'[\u4E00-\u9FA5]+'
han_pattern = re.compile(re_gex)  # compiled once, outside the loop
for e in seg_list:
    # Keep tokens that contain at least one character from the range above
    # and are longer than one character; everything else is only marked in
    # the output and not counted.
    if han_pattern.search(e) and len(e) > 1:
        print(e)
        result[e] += 1
    else:
        print('<------------>')

# Report in descending order of frequency.
print('*************************')
arr = result.most_common(10)  # top ten (word, count) pairs
for a in arr:
    print('key=%s,value=%s' % (a[0], a[1]))

结果:

key=陈笑,value=65
key=叶铮,value=48
key=白衣,value=14
key=大将军,value=14
key=皇子,value=13
key=对方,value=13
key=有些,value=12
key=元帅,value=11
key=自己,value=11
key=这个,value=9