# 随机过采样的手动实现

# 78 阅读 · 1分钟
import numpy as np
import pandas as pd

def random_oversample(X, y, sampling_rate):
    """Randomly oversample under-represented classes by duplicating rows.

    Every class in ``y`` that has fewer than ``sampling_rate`` samples is
    topped up to exactly ``sampling_rate`` samples by drawing rows of that
    class uniformly at random, with replacement. Classes that already have
    at least ``sampling_rate`` samples are left untouched.

    Args:
        X: Feature matrix (array-like); the first axis indexes samples.
        y: Target labels (array-like, 1-D), one per row of ``X``.
        sampling_rate: Despite the name, an absolute count: the minimum
            number of samples (int) each class should have afterwards.

    Returns:
        A tuple ``(X_resampled, y_resampled)`` of new NumPy arrays. The
        original samples come first in their original order, followed by
        the duplicated samples grouped by class in sorted class order.

    Raises:
        ValueError: If ``X`` and ``y`` have a different number of samples.

    Note:
        Draws come from NumPy's global random state; call
        ``np.random.seed`` beforehand for reproducible output.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {len(X)} and {len(y)}"
        )

    # unique_classes[i] occurs class_counts[i] times; iterating the two in
    # lockstep avoids looking a count up by comparing it to a class label.
    unique_classes, class_counts = np.unique(y, return_counts=True)

    extra_indices = []
    for class_value, count in zip(unique_classes, class_counts):
        missing = sampling_rate - count
        if missing <= 0:
            continue
        candidates = np.flatnonzero(y == class_value)
        extra_indices.append(
            np.random.choice(candidates, size=missing, replace=True)
        )

    if not extra_indices:
        # Nothing to add; still return fresh arrays so callers can mutate
        # the result without touching their inputs.
        return X.copy(), y.copy()

    picked = np.concatenate(extra_indices)
    X_resampled = np.concatenate((X, X[picked]))
    y_resampled = np.concatenate((y, y[picked]))
    return X_resampled, y_resampled

# Toy dataset: five two-feature rows; class 0 has two rows, class 1 has three.
X = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 10],
])
y = np.array([0, 0, 1, 1, 1])

# Run the oversampler with sampling_rate=3.
X_resampled, y_resampled = random_oversample(X, y, sampling_rate=3)

# Wrap the resampled arrays in labelled DataFrames.
df_X_resampled = pd.DataFrame(data=X_resampled, columns=['Feature1', 'Feature2'])
df_y_resampled = pd.DataFrame(data=y_resampled, columns=['Target'])

# Show the features first, then the targets.
print(df_X_resampled, df_y_resampled, sep="\n")