NLP之jieba分词 —— from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
# Instantiate CountVectorizer with its default settings.
vector = CountVectorizer()
# Call fit_transform to learn the vocabulary and convert both sentences into a count matrix.
res = vector.fit_transform(["life is is short, i like python", "life is too long, i dislike python"])
# Print the learned feature names.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and convert it to a plain list so the printed form stays a Python list of strings.
if hasattr(vector, "get_feature_names_out"):
    print(vector.get_feature_names_out().tolist())
else:
    # Older scikit-learn releases only provide the legacy accessor.
    print(vector.get_feature_names())

结果：
['dislike', 'is', 'life', 'like', 'long', 'python', 'short', 'too']

# Convert the matrix returned by fit_transform to a dense array and print it
# (one row per input sentence, one column per feature name).
print(res.toarray())

结果：
[[0 2 1 1 0 1 1 0]
[1 1 1 0 1 1 0 1]]

def dictvec():
    """
    Extract features from a list of dictionaries.

    Fits a DictVectorizer on three city/temperature records and prints, in
    order: the feature names, the records recovered by inverse_transform,
    and the transformed feature matrix, each followed by a separator line.

    :return: None
    """
    # Imported locally because no DictVectorizer import is visible in this file.
    from sklearn.feature_extraction import DictVectorizer

    # Instantiate; sparse=False asks for a dense array instead of a sparse matrix.
    # Named `vectorizer` (not `dict`) so the builtin `dict` is not shadowed.
    vectorizer = DictVectorizer(sparse=False)
    data = vectorizer.fit_transform([
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ])

    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() and convert to a list to keep the printed form.
    if hasattr(vectorizer, "get_feature_names_out"):
        print(vectorizer.get_feature_names_out().tolist())
    else:
        print(vectorizer.get_feature_names())
    print("----")
    print(vectorizer.inverse_transform(data))
    print("----")
    print(data)
    print("-----")
    return None


dictvec()

结果：
['city=上海', 'city=北京', 'city=深圳', 'temperature']
----
[{'temperature': 100.0, 'city=北京': 1.0}, {'city=上海': 1.0, 'temperature': 60.0}, {'temperature': 30.0, 'city=深圳': 1.0}]
----
[[ 0. 1. 0. 100.]
[ 1. 0. 0. 60.]
[ 0. 0. 1. 30.]]
-----

def countervec():
    """
    Turn text into count features.

    Fits a CountVectorizer on two sentences and prints the feature names and
    the dense count matrix, each followed by a separator line.

    :return: None
    """
    cv = CountVectorizer()
    # CountVectorizer does not segment Chinese by itself, so the words of the
    # first sentence must be separated by spaces. The separators are restored
    # here to match the documented output (人生 counted twice, 喜欢 and 苦短
    # as separate features).
    data = cv.fit_transform(["人生 人生 苦短，我 喜欢 python", "人生漫长，不用 python"])

    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() and convert to a list to keep the printed form.
    if hasattr(cv, "get_feature_names_out"):
        print(cv.get_feature_names_out().tolist())
    else:
        print(cv.get_feature_names())
    print("----")
    print(data.toarray())
    print("-----")
    return None


countervec()

结果：
['python', '不用', '人生', '人生漫长', '喜欢', '苦短']
----
[[1 0 2 0 1 1]
[1 1 0 1 0 0]]
-----

import jieba
# Segment the sentence with jieba.lcut, which returns the tokens as a list.
con5 = jieba.lcut("今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。")
print(f"con5: {con5}")
# Join the tokens with single spaces so a whitespace-based vectorizer can split them again.
con5_join = " ".join(con5)
# One print call with a newline separator emits the same four lines as before.
print("-------", con5_join, "*", type(con5_join), sep="\n")

结果：
con5: ['今天', '很', '残酷', '，', '明天', '更', '残酷', '，', '后天', '很', '美好', '，', '但', '绝对', '大部分', '是', '死', '在', '明天', '晚上', '，', '所以', '每个', '人', '不要', '放弃', '今天', '。']
-------
今天 很 残酷 ， 明天 更 残酷 ， 后天 很 美好 ， 但 绝对 大部分 是 死 在 明天 晚上 ， 所以 每个 人 不要 放弃 今天 。
*
<class 'str'>

# Segment the same sentence with jieba.cut; the result is consumed once by join below.
con6 = jieba.cut("今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。")
# Join the tokens with single spaces into one string.
join_str = " ".join(con6)
# Print the joined text and its type on separate lines.
print(join_str, type(join_str), sep="\n")

结果：
今天 很 残酷 ， 明天 更 残酷 ， 后天 很 美好 ， 但 绝对 大部分 是 死 在 明天 晚上 ， 所以 每个 人 不要 放弃 今天 。
<class 'str'>

更多学习资料可关注：gzitcast