python效率优化

1,243 阅读1分钟

python版本

该版本与老师的 Python 版本相比,运行速度提升了约 10 倍。

def target_mean_py_v1(data, y_name, x_name):
    """Return the leave-one-out mean of ``y_name`` within each ``x_name`` group.

    For every row the value is the mean of ``y_name`` over the *other* rows
    that share the same ``x_name`` value, i.e.
    ``(group_sum - y) / (group_count - 1)``.

    Args:
        data: pandas DataFrame containing both columns.
        y_name: Name of the numeric target column.
        x_name: Name of the grouping column.

    Returns:
        A 1-D float64 NumPy array with one entry per row of ``data``, in row
        order. Rows whose group contains no other counted row get NaN,
        because the leave-one-out mean is undefined there.
    """
    target = data[y_name].to_numpy(dtype=np.float64)

    # Select the target column before aggregating so the other columns of
    # ``data`` are not summed/counted needlessly.
    grouped = data.groupby(x_name)[y_name]
    # ``transform`` broadcasts each group's aggregate back onto its rows,
    # which replaces the per-row dictionary lookups.
    group_sum = grouped.transform('sum').to_numpy(dtype=np.float64)
    others = grouped.transform('count').to_numpy(dtype=np.float64) - 1.0

    # Pre-fill with NaN and divide only where at least one other row exists;
    # this avoids both division by zero and repeated array reallocation.
    result = np.full(target.shape[0], np.nan)
    np.divide(group_sum - target, others, out=result, where=others > 0)
    return result

cython版本

版本一

该版本的运行效率只有老师 Cython 版本的 60% 左右。

cpdef target_mean_cy_v1(data, y_name, x_name):
    """Return the leave-one-out mean of ``y_name`` within each ``x_name`` group.

    For every row the value is ``(group_sum - y) / (group_count - 1)``, the
    mean of ``y_name`` over the other rows sharing the same ``x_name`` value.

    Args:
        data: pandas DataFrame containing both columns.
        y_name: Name of the target column; its values are converted to int64.
        x_name: Name of the grouping column; its values are converted to int64.

    Returns:
        A 1-D float64 NumPy array with one entry per row of ``data``. Rows
        whose group contains no other counted row are NaN.

    NOTE(review): relies on ``cimport numpy as np`` being present at module
    level for the typed buffers -- confirm against the full file.
    """
    cdef long nrow = data.shape[0]
    # Start from NaN so rows without any other group member stay undefined
    # instead of triggering a division by zero.
    cdef np.ndarray[double] result = np.full(nrow, np.nan, dtype=np.float64)
    # ``np.int`` was removed in NumPy 1.24; a fixed-width dtype also keeps the
    # array's dtype and the declared buffer type in agreement on every platform.
    cdef np.ndarray[np.int64_t] y = np.ascontiguousarray(data[y_name], dtype=np.int64)
    cdef np.ndarray[np.int64_t] x = np.ascontiguousarray(data[x_name], dtype=np.int64)

    # Aggregate only the target column rather than every column of ``data``.
    grouped = data.groupby(x_name)[y_name]
    cdef dict sum_dict = grouped.sum().to_dict()
    cdef dict count_dict = grouped.count().to_dict()

    cdef long i
    for i in range(nrow):
        others = count_dict[x[i]] - 1
        if others > 0:
            result[i] = (sum_dict[x[i]] - y[i]) / others
    return result

版本二

该版本与老师的 Cython 版本相比,运行速度提升了约 2~3 倍。

cpdef target_mean_cy_v2(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    cdef np.ndarray[int] x = np.asfortranarray(data[x_name], dtype=np.int)

    cdef dict value_dict = dict()
    cdef dict count_dict = dict()
    
    cdef long i
    for i in range(nrow):
        if x[i] not in value_dict.keys():
            value_dict[x[i]] = y[i]
            count_dict[x[i]] = 1
        else:
            value_dict[x[i]] += y[i]
            count_dict[x[i]] += 1
    i=0
    for i in range(nrow):
        result[i] = (value_dict[x[i]] - y[i])/(count_dict[x[i]]-1)