python版本
该版本与老师的 Python 版本相比,运行速度提升了约 10 倍。
def target_mean_py_v1(data, y_name, x_name):
    """Compute the leave-one-out target mean for every row of ``data``.

    For each row, the result is the mean of ``y_name`` over all *other*
    rows that share the same ``x_name`` value:
    ``(group_sum - y) / (group_count - 1)``.

    Args:
        data: pandas DataFrame holding both columns.
        y_name: Name of the numeric target column.
        x_name: Name of the grouping (category) column.

    Returns:
        A 1-D float64 NumPy array with one entry per row, in row order.
        Rows whose group contains no other row get ``NaN`` because a
        leave-one-out mean is undefined for them.
    """
    # Aggregate only the target column; summing every column is wasted work
    # and fails when the frame contains non-numeric columns.
    grouped = data.groupby(x_name)[y_name]
    sum_dict = grouped.sum().to_dict()
    count_dict = grouped.count().to_dict()

    # Preallocate: np.append copies the whole array on every call, which
    # made the original loop quadratic in the number of rows.
    result = np.empty(len(data), dtype=np.float64)
    for i, (y, x) in enumerate(zip(data[y_name], data[x_name])):
        others = count_dict[x] - 1
        if others > 0:
            result[i] = (sum_dict[x] - y) / others
        else:
            result[i] = np.nan
    return result
cython版本
版本一
该版本的运行效率只有老师 Cython 版本的 60% 左右。
cpdef target_mean_cy_v1(data, y_name, x_name):
    """Compute the leave-one-out target mean per row using pandas group sums.

    For each row, the result is ``(group_sum - y) / (group_count - 1)``,
    where the group is all rows sharing the same ``x_name`` value. Group sums
    and counts come from a pandas ``groupby``; only the final per-row loop
    runs over typed buffers.

    Args:
        data: pandas DataFrame holding both columns.
        y_name: Name of the numeric target column (read as float64).
        x_name: Name of the grouping column; its values must be castable
            to 64-bit integers.

    Returns:
        A 1-D float64 NumPy array with one entry per row, in row order.
        Rows whose group contains no other row get ``NaN``.
    """
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.zeros(nrow, dtype=np.float64)
    # The target is read as float64 so fractional values are not truncated;
    # the group sums below are computed from the untruncated column.
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    # Fixed-width int64 on both sides: the removed ``np.int`` alias produced
    # a platform-dependent width that does not match a C ``int`` buffer.
    cdef np.ndarray[np.int64_t] x = np.asfortranarray(data[x_name], dtype=np.int64)
    # Aggregate only the target column instead of every column in the frame.
    grouped = data.groupby(x_name)[y_name]
    cdef dict sum_dict = grouped.sum().to_dict()
    cdef dict count_dict = grouped.count().to_dict()
    cdef long i
    cdef long others
    for i in range(nrow):
        others = count_dict[x[i]] - 1
        if others > 0:
            result[i] = (sum_dict[x[i]] - y[i]) / others
        else:
            # A singleton group has no other rows to average over.
            result[i] = np.nan
    return result
版本二
该版本与老师的 Cython 版本相比,运行速度提升了约 2~3 倍。
cpdef target_mean_cy_v2(data, y_name, x_name):
    """Compute the leave-one-out target mean per row with hand-rolled group sums.

    A first pass accumulates the sum and row count of ``y_name`` for each
    ``x_name`` value in plain dicts; a second pass writes
    ``(group_sum - y) / (group_count - 1)`` for every row.

    Args:
        data: pandas DataFrame holding both columns.
        y_name: Name of the numeric target column (read as float64).
        x_name: Name of the grouping column; its values must be castable
            to 64-bit integers.

    Returns:
        A 1-D float64 NumPy array with one entry per row, in row order.
        Rows whose group contains no other row get ``NaN``.
    """
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.zeros(nrow, dtype=np.float64)
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    # Fixed-width int64 on both sides: the removed ``np.int`` alias produced
    # a platform-dependent width that does not match a C ``int`` buffer.
    cdef np.ndarray[np.int64_t] x = np.asfortranarray(data[x_name], dtype=np.int64)
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef long i
    cdef long others
    cdef np.int64_t key

    # Pass 1: per-group running sum and row count.
    for i in range(nrow):
        key = x[i]
        if key in value_dict:
            value_dict[key] += y[i]
            count_dict[key] += 1
        else:
            value_dict[key] = y[i]
            count_dict[key] = 1

    # Pass 2: remove the row's own target from its group total.
    for i in range(nrow):
        key = x[i]
        others = count_dict[key] - 1
        if others > 0:
            result[i] = (value_dict[key] - y[i]) / others
        else:
            # A singleton group has no other rows to average over.
            result[i] = np.nan
    return result