import numpy as np
import pandas as pd
def random_oversample(X, y, sampling_rate):
    """Randomly oversample under-represented classes by duplicating samples.

    Every class that has fewer than ``sampling_rate`` samples is topped up to
    exactly ``sampling_rate`` samples by drawing rows of that class at random
    (with replacement). Classes that already have at least ``sampling_rate``
    samples are left untouched.

    Args:
        X: Feature matrix with one row per sample (array-like).
        y: Target labels, one per row of ``X`` (array-like).
        sampling_rate: Minimum number of samples each class should have after
            resampling. Despite the name, it is used as an absolute count.

    Returns:
        A tuple ``(X_resampled, y_resampled)`` of new NumPy arrays. The
        original samples come first, in their original order, followed by the
        duplicated rows grouped by class in sorted label order.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    unique_classes, class_counts = np.unique(y, return_counts=True)

    # Start from the untouched data and collect the extra rows, so the
    # arrays are concatenated only once at the end.
    X_parts = [X]
    y_parts = [y]
    for class_value, class_count in zip(unique_classes, class_counts):
        shortfall = sampling_rate - class_count
        if shortfall <= 0:
            continue  # class already has enough samples
        indices_to_repeat = np.where(y == class_value)[0]
        repeats = np.random.choice(indices_to_repeat, size=shortfall, replace=True)
        X_parts.append(X[repeats])
        y_parts.append(y[repeats])

    # np.concatenate always returns fresh arrays, so the caller's inputs are
    # never aliased even when nothing had to be added.
    X_resampled = np.concatenate(X_parts, axis=0)
    y_resampled = np.concatenate(y_parts, axis=0)
    return X_resampled, y_resampled
# Toy dataset: five samples with two features each (values 1..10 laid out
# row by row); class 0 has two samples and class 1 has three.
X = np.arange(1, 11).reshape(5, 2)
y = np.repeat([0, 1], [2, 3])

# Run the oversampler, asking for a sampling rate of 3.
X_resampled, y_resampled = random_oversample(X, y, 3)

# Wrap the results in labelled DataFrames for display.
df_X_resampled = pd.DataFrame(data=X_resampled, columns=['Feature1', 'Feature2'])
df_y_resampled = pd.DataFrame(data=y_resampled, columns=['Target'])

print(df_X_resampled, df_y_resampled, sep="\n")