pandas分析电影评分

203 阅读1分钟
import pandas
import math
import numpy

# pandas works on tabular input.  Besides read_csv (used below) it also
# provides readers such as pandas.read_sql() and pandas.read_excel().

# User table: pipe-separated.  Column names are passed via names=, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): 'uers_info.txt' looks like a typo of 'users_info.txt' --
# confirm against the actual file on disk before renaming.
u_cols1 = ['users_id', 'age', 'sex', 'job', 'zip_code']
users_info = pandas.read_csv(
    'uers_info.txt',
    names=u_cols1,
    sep='|',
    encoding='ISO-8859-1',
)

# Ratings table: tab-separated, four columns named below.
u_cols2 = ['users_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pandas.read_csv(
    'u.data.txt',
    names=u_cols2,
    sep='\t',
    encoding='ISO-8859-1',
)

# Movie table: pipe-separated; usecols=range(4) keeps only the first four
# columns of the file.
# NOTE(review): 'release_data' is presumably meant to be 'release_date'; the
# name is kept unchanged because it becomes a column label of `movies`.
u_cols3 = ['movie_id', 'title', 'release_data', 'imdb_url']
movies = pandas.read_csv(
    'u.item',
    names=u_cols3,
    usecols=range(4),
    sep='|',
    encoding='ISO-8859-1',
)
# Merge the three tables into a single frame: users are joined to ratings on
# 'users_id', and that result is joined to movies on 'movie_id'.  Those are
# the only column names each pair of tables has in common, so spelling them
# out yields the same rows as the default join on shared columns.
# Tip: data[data.users_id == 1] shows every movie and rating of user 1.
data = users_info.merge(ratings, on='users_id').merge(movies, on='movie_id')
# Mean rating of every title, with one column per distinct value of 'sex'.
ratings_by_gender = pandas.pivot_table(
    data,
    index='title',
    columns='sex',
    values='rating',
    aggfunc='mean',
)

# Absolute gap between the 'F' and 'M' mean rating of each title.  The gap is
# NaN for a title that lacks a mean in either column.
ratings_by_gender['difference'] = (
    ratings_by_gender['F'] - ratings_by_gender['M']
).abs()

# The same table ordered both ways; with sort_values' default NaN placement
# the NaN gaps end up at the bottom in either direction.
sort1 = ratings_by_gender.sort_values('difference', ascending=False)
sort2 = ratings_by_gender.sort_values('difference')

print('男女评分分歧最小的5部电影')
print(sort2.head(5))
print('男女评分分歧最大的5部电影')
print(sort1.head(5))
# Variance of the ratings each title received, taken over all users.  The
# result is a one-column frame whose column is named 'rating'.
ratings_by_allusers = pandas.pivot_table(
    data,
    index='title',
    values='rating',
    aggfunc='var',
)

# Lowest variance first: the titles users agree on most.
sort3 = ratings_by_allusers['rating'].sort_values()
print('评分分歧最小的5部电影')
print(sort3.head(5))

# Highest variance first: the titles users disagree on most.
sort4 = ratings_by_allusers['rating'].sort_values(ascending=False)
print('评分分歧最大的5部电影')
print(sort4.head(5))