# 机器学习基础编程机器学习和cv学习，机器学习与计算机视觉结合编程，通过Python调用Scikit-Learn、Tens

#####CV

# 1. Dataset构建

import os # 导入操作系统模块，用于文件路径操作

import random # 导入随机数模块，用于数据打乱

from PIL import Image # 导入图像处理库，用于读取和处理图像

import matplotlib.pyplot as plt # 导入绘图库，用于可视化

import matplotlib as mpl # 导入matplotlib配置模块，用于设置绘图参数

import numpy as np # 导入数值计算库，用于数组操作和数学计算

from collections import defaultdict # 导入默认字典，用于统计

from pathlib import Path # 导入路径处理模块，用于跨平台路径操作

from matplotlib.font_manager import FontProperties # 导入字体管理模块，用于设置中文字体

from tqdm import tqdm # 导入进度条模块，用于显示训练进度

from torch.utils.data import Dataset, Subset # 导入PyTorch数据集相关模块

from torch.utils.data import DataLoader # 导入数据加载器，用于批量加载数据

# 1.1 获取图片信息

# 1.2 构建Dataset

class GarbageDataset4(Dataset):
    """Garbage-image dataset that collapses fine-grained folder labels into 4 classes.

    Images are discovered recursively under ``root_dir_``. The name of the
    directory that directly contains an image is parsed as its integer
    fine-grained class id, which is then mapped to one of the 4 coarse
    classes through ``map_dict``.
    """

    cls_num = 4  # number of coarse target classes

    def __init__(self, root_dir_, transform=None, class_names_path=None):
        """Index every image under ``root_dir_`` and build the label mapping.

        Args:
            root_dir_: Root directory; images live in sub-directories named
                after their integer fine-grained class id.
            transform: Optional callable applied to each PIL image in
                ``__getitem__``.
            class_names_path: Text file with one fine-grained class name per
                line, formatted ``<coarse name>-<detail>``; the line order
                defines the fine-grained class ids.

        Raises:
            Exception: If ``class_names_path`` is empty.
            FileNotFoundError: If ``class_names_path`` does not exist.
            KeyError: If a class name has an unknown coarse-name prefix.
        """
        self.root_dir = root_dir_
        self.transform = transform
        self.img_info = []  # [(image_path, coarse_label), ...]
        self.label_array = None

        if not class_names_path:
            raise Exception("class_names_path is empty !!")
        self.class_names = self.load_class_names(class_names_path)
        if self.class_names is None:
            # Without the fine-grained names no mapping can be built; fail here
            # with a clear message instead of a TypeError further down.
            raise FileNotFoundError(
                "class names file '{}' not found; it is required to build the "
                "fine-to-coarse label mapping".format(class_names_path))

        # The mapping must be derived from the fine-grained names *before*
        # class_names is replaced by the 4 coarse names.
        self.map_dict = self.generate_mapping_dict()
        self.class_names = ["厨余垃圾", "可回收物", "其他垃圾", "有害垃圾"]
        self._get_img_info()

    @staticmethod
    def load_class_names(class_file_path):
        """Read non-empty, stripped lines from ``class_file_path``.

        Returns:
            A list of class names, or ``None`` (after printing a warning) when
            the file does not exist.
        """
        if not os.path.exists(class_file_path):
            print(f"警告: 类别名称文件 '{class_file_path}' 不存在，将使用类别ID作为名称")
            return None

        with open(class_file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]

    def __getitem__(self, index):
        """Return ``(image, label, path)`` for the sample at ``index``."""
        path_img, label = self.img_info[index]
        img = Image.open(path_img).convert('RGB')

        # Preprocessing is usually a torchvision.transforms pipeline.
        if self.transform is not None:
            img = self.transform(img)

        return img, label, path_img

    def __len__(self):
        """Return the number of indexed images.

        Raises:
            Exception: If no image was found under ``root_dir``; the message
                names the directory to ease debugging.
        """
        if len(self.img_info) == 0:
            raise Exception("\ndata_dir:{} is a empty dir! Please checkout your path to images!".format(
                self.root_dir))
        return len(self.img_info)

    @staticmethod
    def _list_image_files(root_dir, img_extensions=None):
        """Recursively collect paths under ``root_dir`` that have an image extension."""
        if img_extensions is None:
            img_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.tiff'}

        image_files = []
        for root, _, files in os.walk(root_dir):
            for file in files:
                # Extension match is case-insensitive.
                if os.path.splitext(file)[1].lower() in img_extensions:
                    image_files.append(os.path.join(root, file))
        return image_files

    def _get_img_info(self):
        """Populate and return ``self.img_info`` as ``[(path, coarse_label), ...]``."""
        image_files = self._list_image_files(self.root_dir)

        # The parent directory name is the fine-grained class id.
        fine_labels = [int(os.path.basename(os.path.dirname(p_))) for p_ in image_files]
        coarse_labels = [int(self.map_dict[fine]) for fine in fine_labels]

        self.img_info = list(zip(image_files, coarse_labels))
        print(f"成功加载 {len(self.img_info)} 张图像，共 {len(set(coarse_labels))} 个类别")
        return self.img_info

    def generate_mapping_dict(self):
        """Build ``{fine_class_id: coarse_class_id}`` from ``self.class_names``.

        The coarse class is taken from the text before the first ``-`` of each
        fine-grained class name.
        """
        prefix_mapping = {
            "厨余垃圾": 0,
            "可回收物": 1,
            "其它垃圾": 2,  # both spellings of "other waste" occur
            "其他垃圾": 2,
            "有害垃圾": 3,
        }

        new_class_list = [prefix_mapping[name.split("-")[0]] for name in self.class_names]
        # Fine-grained ids are simply the positions in the class-name file.
        return dict(enumerate(new_class_list))

# 1.3 加入预处理模块transforms

# 2. 构建DataLoader（库：DataLoader）

#### Data augmentation (libraries: Normalize / Resize)
# This import had been swallowed by the heading comment, leaving `transforms` undefined.
import torchvision.transforms as transforms

train_bs = 64  # batch size, used for both loaders below
workers = 0  # 0 worker processes: avoids multiprocessing issues inside notebooks
data_root_dir = r"/root/autodl-tmp/data/garbage265"

# step0: training / validation directories
train_dir = os.path.join(data_root_dir, "train")
valid_dir = os.path.join(data_root_dir, "val")

# =========================== exam focus: begin ===========================
# step1: dataset configuration
cls_names_path = os.path.join(train_dir, "classname.txt")

# Per-channel ImageNet statistics used for normalisation.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]
normTransform = transforms.Normalize(norm_mean, norm_std)

# Training pipeline with augmentation.
transforms_train = transforms.Compose([
    transforms.Resize((256)),  # (256) is a plain int: the shorter side is resized to 256
    transforms.CenterCrop(256),  # square 256x256 centre crop
    transforms.RandomCrop(224),  # random 224x224 patch
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    normTransform,
])

# Validation pipeline: deterministic, no augmentation.
transforms_valid = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normTransform,
])
# =========================== exam focus: end =============================

train_data_full = GarbageDataset4(root_dir_=train_dir, transform=transforms_train, class_names_path=cls_names_path)
valid_data = GarbageDataset4(root_dir_=valid_dir, transform=transforms_valid, class_names_path=cls_names_path)

# Train on the first 10% of the indexed training samples.
# NOTE(review): indices follow directory-walk order, so this slice is presumably
# dominated by a few class folders — consider a random sample; confirm intent.
num_train = int(len(train_data_full) * 0.1)
indices = list(range(num_train))
train_data_full = Subset(train_data_full, indices)

train_loader = DataLoader(dataset=train_data_full, batch_size=train_bs, shuffle=True, num_workers=workers)
valid_loader = DataLoader(dataset=valid_data, batch_size=train_bs, num_workers=workers)

# Sanity check: print the first (image, label, path) sample.
print(next(iter(train_data_full)))

# 3. 模型搭建

# 3.1 模型定义

import timm

import torch

import torch.nn as nn

#### ！！！timm库分类模型加载与分类头修改（timm.create_model/）

###########################################考试押题这里开始

# 模型加载和分类头修改

# Locate the local ResNet-18 weights relative to the current working directory.
data_dir = os.getcwd()
path_resnet18 = os.path.join(
    data_dir, "data", "pretrained_model", "resnet18-5c106cde.pth"
)

# Build a timm ResNet-18 and initialise it from the local checkpoint.
model = timm.create_model('resnet18', checkpoint_path=path_resnet18)

# Replace the classification head: same input width, 265 output classes.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=265)

###########################################考试押题这里结束

# 4. 损失函数与优化器

#### ！！！完成循环训练（必考CosineAnnealingLR/CrossEntropyLoss/SGD/Parameters）

###########################################考试押题这里开始

import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

lr_init = 0.01  # initial learning rate
max_epoch = 5  # number of epochs; also the cosine period below

# Cross-entropy loss over the raw logits.
loss_f = nn.CrossEntropyLoss()

# SGD with momentum and L2 weight decay over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=lr_init,
    momentum=0.9,
    weight_decay=1e-4,
)

# Cosine annealing from lr_init down to 1% of lr_init over max_epoch steps.
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=max_epoch,
    eta_min=lr_init*0.01,
)

###########################################考试押题这里结束

# 5. 迭代训练

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Debug switch. True keeps the behaviour of these notes: stop after one batch
# and one epoch, so validation and the scheduler step are skipped. Set it to
# False for a full training run (previously that code was unreachable).
DEBUG_ONE_STEP = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch_idx in range(max_epoch):
    print(f"epoch:{epoch_idx}/{max_epoch}")

    # ---------------- training phase ----------------
    model.train()
    all_preds = []
    all_labels = []
    train_loss = []

    for i, data in tqdm(enumerate(train_loader), desc='training: '):
        inputs, labels, path_imgs = data
        inputs, labels = inputs.to(device), labels.to(device)
        print("inputs:{}".format(inputs.shape))
        print("labels:{}".format(labels.shape))

        # ===================== exam focus: begin =====================
        optimizer.zero_grad()  # clear gradients left from the previous step

        outputs = model(inputs)  # logits, one row per sample and 265 columns
        print("outputs:{}".format(outputs.shape))

        # Compute the loss on the model's device; copying to CPU first is
        # unnecessary and only slows the step down.
        loss = loss_f(outputs, labels)

        _, predicted = torch.max(outputs.data, 1)  # arg-max class per sample
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())

        if DEBUG_ONE_STEP:
            break  # debugging: a single batch is enough
        # ===================== exam focus: end =======================

    # Training metrics for this epoch.
    acc = accuracy_score(all_labels, all_preds)
    cr = classification_report(all_labels, all_preds)
    conf_mx = confusion_matrix(all_labels, all_preds)
    print("train acc:{:.2f}".format(np.mean(acc)))
    print(cr)
    print(conf_mx)

    if DEBUG_ONE_STEP:
        break  # debugging: a single epoch is enough

    # ---------------- validation phase ----------------
    model.eval()
    valid_loss = []
    with torch.no_grad():  # no gradients needed for evaluation
        for i, data in tqdm(enumerate(valid_loader), desc='valiate: '):
            inputs, labels, path_imgs = data
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = loss_f(outputs, labels)
            valid_loss.append(loss.item())

    # The scheduler advances once per epoch.
    scheduler.step()

    print("train loss:{:.2f}, valid loss:{:.2f}".format(np.mean(train_loss), np.mean(valid_loss)))

####################################################################也有可能考试押题这里结束

#（create_model）

####################################################################考试押题这里开始

# 加载预训练模型进行预测

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score

# Dictionary mapping each fine-grained class id to one of the 4 coarse classes.
mapping_dict = valid_data.map_dict

device = "cpu"
data_dir = os.getcwd()
path_ckpt = os.path.join(data_dir, "data", "pretrained_model", "resnet18-5c106cde.pth")

# Build the bare architecture and load the weights exactly once, from the path
# defined in this cell (previously the model was also initialised from
# `path_resnet18`, a variable that only exists if the training cell has run).
model = timm.create_model('resnet18')
ckpt = torch.load(path_ckpt, map_location=device, weights_only=True)
model.load_state_dict(ckpt)

# Swap the head for 265 outputs.
# NOTE(review): the new head is created after the weights are loaded, so it is
# randomly initialised — presumably a fine-tuned 265-class checkpoint should be
# loaded afterwards; confirm.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 265)

model.to(device)
model.eval()  # inference mode: freezes BatchNorm statistics

####################################################################考试押题这里结束

# 推理

all_preds = []  # predicted class id per validation sample
all_labels = []  # ground-truth label per validation sample

# Gradients are not needed for inference; disabling them saves memory and time.
with torch.no_grad():
    for i, data in tqdm(enumerate(valid_loader), total=len(valid_loader)):
        inputs, labels, path_imgs = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)

        # Arg-max over the class dimension gives the predicted class id.
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

#### ！！！265类分类模型迁移到4分类的标签映射中

####################################################################考试押题这里开始

# 计算265分类的评估指标

# Metrics on the raw (un-mapped) labels and predictions.
acc = accuracy_score(all_labels, all_preds)
cr = classification_report(all_labels, all_preds)
conf_mx = confusion_matrix(all_labels, all_preds)

print("val acc:{:.2f}".format(np.mean(acc)))
print(cr)
print(conf_mx)

#### Confusion-matrix statistics and metric analysis
# Collapse the 265 fine-grained ids into the 4 coarse classes and re-evaluate.
class_names_4 = ["厨余垃圾", "可回收垃圾", "其他垃圾", "有害垃圾"]

# NOTE(review): valid_data is a GarbageDataset4, whose labels are already mapped
# to 0-3 when the dataset is indexed; re-mapping all_labels here assumes the
# loader yields 265-class labels — confirm which dataset feeds this cell.
all_labels_4 = [mapping_dict[i] for i in all_labels]
all_preds_4 = [mapping_dict[i] for i in all_preds]

# `labels` pins the report to all 4 classes so target_names still lines up
# when a class is absent from both the labels and the predictions.
report = classification_report(
    all_labels_4,
    all_preds_4,
    labels=list(range(len(class_names_4))),
    target_names=class_names_4,
    digits=4,
)

print("\n各类别评估指标:")
print(report)

####################################################################考试押题这里结束

# 机器学习 ################################################################################################

# 二、监督学习

# 1.分类任务

import pandas as pd # 导入pandas库，用于数据处理和分析

# 1. 数据加载与预处理

# 读取训练数据和测试数据

# Load the training and test sets.
train = pd.read_csv('20250719机器学习编程实操代码-02信用评分卡（王贺）/train.csv')
test = pd.read_csv('20250719机器学习编程实操代码-02信用评分卡（王贺）/test.csv')

# 2. Missing values: only the categorical Credit_Product column is filled,
#    using an explicit 'Unknown' category.
train['Credit_Product'] = train['Credit_Product'].fillna('Unknown')
test['Credit_Product'] = test['Credit_Product'].fillna('Unknown')

# Split features from the target: ID and Target are not features.
X_train = train.drop(['ID', 'Target'], axis=1)
y_train = train['Target']

# 3. Feature preprocessing: declare which columns are categorical / numeric.
cat_cols = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']
num_cols = ['Age', 'Vintage', 'Avg_Account_Balance']

label = 'Target'
features = cat_cols+num_cols

##（sklearn/CatBoost/LGBM/XGBoost）

##（onehot/Classifier）

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# `import matplotlib as plt` bound the package itself to `plt`, shadowing the
# pyplot alias imported earlier in the file; import pyplot instead.
import matplotlib.pyplot as plt
import numpy as np
import time

# Candidate classifiers, keyed by display name.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=30,  # number of boosted trees
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,  # row sampling ratio per tree
        colsample_bytree=0.8,  # column sampling ratio per tree
        random_state=42,
        n_jobs=-1,  # use all CPU cores
        # Early stopping is intentionally left out: the parameter is spelled
        # `early_stopping_rounds` and requires an eval_set passed to fit().
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=30,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
    "CatBoost": CatBoostClassifier(
        iterations=30,  # number of boosting iterations
        depth=6,
        learning_rate=0.05,
        random_seed=42,
        verbose=0,  # silence per-iteration logging
        thread_count=-1,  # use all CPU cores
    ),
}

# 模型性能记录

# Per-model summary table filled by the loop below (one row per model).
results = {"Model": [], "AUC": [], "F1": [], "ACC": [], "Time (s)": [], "Parameters": []}

# 5-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    # Per-fold scores plus the pooled out-of-fold labels and predictions.
    auc_scores = []
    f1_scores = []
    acc_scores = []
    y_true_all = []
    y_pred_all = []

    # K-fold loop
    for train_index, val_index in kfold.split(X_train):
        # === 1. split ===
        X_train_fold = X_train.iloc[train_index]
        X_val_fold = X_train.iloc[val_index]
        y_train_fold = y_train.iloc[train_index]
        y_val_fold = y_train.iloc[val_index]

        # === 2. scale numeric features (fit on the training fold only) ===
        scaler = StandardScaler()
        X_train_num = scaler.fit_transform(X_train_fold[num_cols])
        X_val_num = scaler.transform(X_val_fold[num_cols])

        # === 3. one-hot encode categorical features (fit on the training fold only) ===
        # Categories unseen during fit are ignored at transform time.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        X_train_cat = encoder.fit_transform(X_train_fold[cat_cols])
        X_val_cat = encoder.transform(X_val_fold[cat_cols])

        # === 4. concatenate numeric and encoded categorical columns ===
        X_train_processed = np.hstack([X_train_num, X_train_cat])
        X_val_processed = np.hstack([X_val_num, X_val_cat])

        # === 5. fit ===
        # The same estimator object is refitted on every fold. To keep the
        # entries of `models` untouched, fit sklearn.base.clone(model) instead.
        model.fit(X_train_processed, y_train_fold)

        # === 6. predict ===
        y_pred_proba = model.predict_proba(X_val_processed)[:, 1]  # second probability column
        y_pred = model.predict(X_val_processed)

        # === 7. fold metrics ===
        auc_scores.append(roc_auc_score(y_val_fold, y_pred_proba))
        f1_scores.append(f1_score(y_val_fold, y_pred))
        acc_scores.append(accuracy_score(y_val_fold, y_pred))

        y_true_all.extend(y_val_fold)
        y_pred_all.extend(y_pred)

    # === 8. aggregate over folds ===
    auc = np.mean(auc_scores)
    f1 = np.mean(f1_scores)
    acc = np.mean(acc_scores)
    cr = classification_report(y_true_all, y_pred_all)
    conf_mx = confusion_matrix(y_true_all, y_pred_all)
    total_time = time.time() - start_time

    # === 9. record one row per model ===
    results["Model"].append(name)
    results["AUC"].append(auc)
    results["F1"].append(f1)
    results["ACC"].append(acc)
    results["Time (s)"].append(total_time)
    results["Parameters"].append(str(model.get_params()))

    # === 10. report ===
    print(f"{name} - AUC: {auc:.4f}, F1: {f1:.4f}, ACC: {acc:.4f}, Time: {total_time:.2f}s")
    print("\n分类报告:")
    print(cr)
    print("\n混淆矩阵:")
    print(conf_mx)

# Stacking

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
# The aliases xgb / lgb / cb were never imported in this file; use the
# estimator classes directly.
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# First-level (base) models.
base_models = [
    ('xgb', XGBClassifier(n_estimators=30, max_depth=5, learning_rate=0.05, random_state=42)),
    ('lgb', LGBMClassifier(n_estimators=30, max_depth=5, learning_rate=0.05, random_state=42, verbose=-1)),
    ('cat', CatBoostClassifier(iterations=30, depth=5, learning_rate=0.05, random_seed=42, verbose=0)),
]

# Second-level (meta) model trained on the base models' cross-validated outputs.
meta_model = LogisticRegression(C=0.1, max_iter=1000, random_state=42)

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,  # 5-fold CV produces the meta-features
    n_jobs=12,  # parallel jobs
)

# NOTE(review): X_train_final, X_val_final and y_val are not defined anywhere in
# this file — presumably a hold-out split plus the scaling / one-hot encoding
# used in the K-fold section; define them before running this cell.
print("Training Stacking Model...")
start_time = time.time()
stacking_model.fit(X_train_final, y_train)
train_time = time.time() - start_time

# Hold-out evaluation on the second probability column.
y_pred_stacking = stacking_model.predict_proba(X_val_final)[:, 1]
auc_stacking = roc_auc_score(y_val, y_pred_stacking)

print(f"Stacking融合AUC: {auc_stacking:.4f}, Time: {train_time:.2f}s")

from sklearn.linear_model import ElasticNet, BayesianRidge # 导入弹性网络和贝叶斯岭回归模型

def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y, eval_type='regression'):
    """Combine three base models with a second-level BayesianRidge (stacking).

    Args:
        oof_1, oof_2, oof_3: Out-of-fold predictions of the three base models
            on the training set (1-D, one value per training sample).
        predictions_1, predictions_2, predictions_3: Predictions of the three
            base models on the test set (1-D, one value per test sample).
        y: Ground-truth targets of the training set.
        eval_type: ``'regression'`` (default) prints the RMSE of the stacked
            out-of-fold predictions, ``'binary'`` prints their log loss; any
            other value prints nothing.

    Returns:
        Tuple ``(oof, predictions)``: the second-level model's out-of-fold
        predictions on the training set and its fold-averaged predictions on
        the test set.
    """
    from sklearn.model_selection import RepeatedKFold

    # Stack per-model vectors into feature matrices of shape (n_samples, 3).
    train_stack = np.vstack([oof_1, oof_2, oof_3]).transpose()
    test_stack = np.vstack([predictions_1, predictions_2, predictions_3]).transpose()

    # Positional indexing below must not depend on a pandas index.
    y = np.asarray(y)

    # 5 folds, a single repetition, fixed seed.
    folds = RepeatedKFold(n_splits=5, n_repeats=1, random_state=2018)

    oof = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    # get_n_splits() already returns n_splits * n_repeats, i.e. the number of
    # fitted models, so it is used directly as the averaging divisor.
    total_folds = folds.get_n_splits()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, y)):
        print("fold n°{}".format(fold_+1))

        trn_data, trn_y = train_stack[trn_idx], y[trn_idx]
        val_data = train_stack[val_idx]

        print("-" * 10 + "Stacking " + str(fold_+1) + "-" * 10)

        # Second-level model.
        clf = BayesianRidge()
        clf.fit(trn_data, trn_y)

        oof[val_idx] = clf.predict(val_data)
        # Accumulate the fold's share of the averaged test prediction.
        predictions += clf.predict(test_stack) / total_folds

    if eval_type == 'regression':
        from sklearn.metrics import mean_squared_error
        print('RMSE: ', np.sqrt(mean_squared_error(y, oof)))
    elif eval_type == 'binary':
        from sklearn.metrics import log_loss
        # BayesianRidge is a regressor, so its outputs are not bounded to
        # [0, 1]; clip them for the metric only (the returned oof is unchanged).
        print('Log Loss: ', log_loss(y, np.clip(oof, 0.0, 1.0)))

    return oof, predictions