作业一:基础应用 – 鸢尾花分类
任务目标:
使用随机森林对鸢尾花数据集进行分类,并分析特征重要性
数据集:
sklearn.datasets.load_iris()
要求步骤:
作业二:信用卡欺诈检测
任务目标:
使用随机森林处理类别不平衡的信用卡欺诈检测问题
数据集:
Kaggle 信用卡欺诈数据集(Credit Card Fraud Detection,文件名 creditcard.csv)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Assignment 1: classify the iris dataset with a random forest, report the
# hold-out metrics, and plot the confusion matrix next to the feature
# importances.

# Select a font that can render the Chinese titles and axis labels below.
plt.rcParams["font.sans-serif"] = ["SimHei"]
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

# 1. Load the iris dataset and split it 70% / 30% into train and test sets.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 2. Build the random forest classifier (n_estimators=100, max_depth=3).
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

# 3. Fit on the training split and measure accuracy on the test split.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print(f"\n模型准确率: {accuracy:.4f}")

# 4. Print the classification report and the confusion matrix.
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print("\n混淆矩阵:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Left panel: confusion matrix heatmap (rows = true label, cols = predicted).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
)
plt.title('混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')

# 5. Right panel: feature importances, sorted from most to least important.
feature_importance = rf.feature_importances_
feature_names = iris.feature_names
plt.subplot(1, 2, 2)
indices = np.argsort(feature_importance)[::-1]
plt.bar(range(len(feature_importance)), feature_importance[indices])
plt.xticks(
    range(len(feature_importance)),
    [feature_names[i] for i in indices],
    rotation=45,
)
plt.title('特征重要性')
plt.xlabel('特征')
plt.ylabel('重要性')
plt.tight_layout()
plt.show()
# 6. (Optional) Vary n_estimators and max_depth and observe how the test
#    accuracy changes. Relies on X_train / X_test / y_train / y_test from the
#    split performed earlier in this script.


def _holdout_accuracy(n_estimators, max_depth):
    """Fit a random forest with the given settings and return test accuracy.

    The forest is trained on ``X_train`` / ``y_train`` and scored on
    ``X_test`` / ``y_test``; ``random_state`` is fixed at 42 so runs are
    repeatable.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42
    )
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\n选做部分 – 参数调优:")

# Sweep n_estimators while keeping max_depth fixed at 3.
n_estimators_range = [10, 50, 100, 150, 200]
accuracies_n_est = []
for n_est in n_estimators_range:
    acc = _holdout_accuracy(n_est, 3)
    accuracies_n_est.append(acc)
    print(f"n_estimators={n_est}, 准确率={acc:.4f}")

# Sweep max_depth while keeping n_estimators fixed at 100.
max_depth_range = [1, 2, 3, 4, 5, 6, None]
accuracies_max_depth = []
for max_d in max_depth_range:
    acc = _holdout_accuracy(100, max_d)
    accuracies_max_depth.append(acc)
    # The f-string renders None as "None", so no special case is required.
    print(f"max_depth={max_d}, 准确率={acc:.4f}")

# Plot both sweeps side by side.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(n_estimators_range, accuracies_n_est, marker='o')
plt.title('n_estimators对准确率的影响')
plt.xlabel('n_estimators')
plt.ylabel('准确率')

plt.subplot(1, 2, 2)
# max_depth contains None, so plot against positions and label the ticks.
x_labels = [str(d) for d in max_depth_range]
plt.plot(range(len(max_depth_range)), accuracies_max_depth, marker='o')
plt.xticks(range(len(max_depth_range)), x_labels)
plt.title('max_depth对准确率的影响')
plt.xlabel('max_depth')
plt.ylabel('准确率')
plt.tight_layout()
plt.show()
作业二:信用卡欺诈检测(Credit Card Fraud)
要求步骤:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score, f1_score
# Assignment 2: detect credit card fraud on a heavily imbalanced dataset with
# a class-weighted random forest.

# Step 1: load the transaction data.
# NOTE: the dataset must be downloaded first and placed in the current
# working directory as "creditcard.csv".
df = pd.read_csv("creditcard.csv")

# Inspect the class imbalance.
print("数据集大小:", df.shape)
print("类别分布:")
print(df['Class'].value_counts())
print("欺诈交易比例: {:.4f}%".format(df['Class'].sum() / len(df) * 100))

# Step 2 (part 1): drop the Time column.
df = df.drop(['Time'], axis=1)

# Separate the features from the label.
X = df.drop('Class', axis=1)
y = df['Class']

# Step 3: stratified train/test split so both sides keep the fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2 (part 2): standardize Amount. The scaler is fitted on the training
# split only and then applied to the test split, so no test-set statistics
# leak into preprocessing. Copies avoid writing into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']]).ravel()
X_test['Amount'] = scaler.transform(X_test[['Amount']]).ravel()

print("训练集大小:", X_train.shape)
print("测试集大小:", X_test.shape)
print("训练集欺诈交易数:", y_train.sum())
print("测试集欺诈交易数:", y_test.sum())

# Step 4: build the random forest classifier.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # compensate for the class imbalance
    random_state=42,
)

# Train the model.
rf_classifier.fit(X_train, y_train)

# Step 5: evaluate the model.
# Hard predictions feed precision/recall/F1; the positive-class probability
# (column 1 of predict_proba) feeds AUC-ROC.
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

# Compute the evaluation metrics.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\n模型评估结果:")
print("精确率(Precision): {:.4f}".format(precision))
print("召回率(Recall): {:.4f}".format(recall))
print("F1分数: {:.4f}".format(f1))
print("AUC-ROC: {:.4f}".format(auc_roc))

# Detailed per-class report.
print("\n详细分类报告:")
print(classification_report(y_test, y_pred))
评论前必须登录!
注册