import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# Load the scikit-learn breast-cancer dataset: X holds the features,
# y holds the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 30% of the samples as the test split; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# LightGBM settings for the run with a large leaf budget (num_leaves=64).
# NOTE: `min_child_samples` and `min_data_in_leaf` name the same setting;
# LightGBM warns that `min_child_samples` is ignored when both are given.
params = dict(
    bagging_fraction=0.8,
    feature_fraction=0.9,
    lambda_l1=0.001,
    lambda_l2=0.001,
    learning_rate=0.01,
    max_depth=5,
    metric='binary_logloss',
    min_child_samples=10,
    min_data_in_leaf=10,
    min_gain_to_split=1e-4,
    n_estimators=100,
    num_leaves=64,
    num_threads=4,
    objective='binary',
)
from tpf.mlib.lightgbm import lgbm_baseline

# Run the project's LightGBM baseline helper for 100 boosting rounds with no
# categorical features declared.
# NOTE(review): presumably the helper trains on the first pair and validates
# on the second — confirm against tpf.mlib.lightgbm.
model = lgbm_baseline(
    X_train, y_train, X_test, y_test,
    cat_features=[],
    num_boost_round=100,
    params=params,
)
[LightGBM] [Warning] min_data_in_leaf is set=20, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=20
[LightGBM] [Warning] min_data_in_leaf is set=20, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=20
[LightGBM] [Info] Number of positive: 249, number of negative: 149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3978
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 30
[LightGBM] [Warning] min_data_in_leaf is set=20, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625628 -> initscore=0.513507
[LightGBM] [Info] Start training from score 0.513507
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best
# Same LightGBM settings as the previous run, but with a small leaf budget
# (num_leaves=5).
# NOTE: `min_child_samples` and `min_data_in_leaf` name the same setting;
# LightGBM warns that `min_child_samples` is ignored when both are given.
params = dict(
    bagging_fraction=0.8,
    feature_fraction=0.9,
    lambda_l1=0.001,
    lambda_l2=0.001,
    learning_rate=0.01,
    max_depth=5,
    metric='binary_logloss',
    min_child_samples=10,
    min_data_in_leaf=10,
    min_gain_to_split=1e-4,
    n_estimators=100,
    num_leaves=5,
    num_threads=4,
    objective='binary',
)
from tpf.mlib.lightgbm import lgbm_baseline

# Short run: only 3 boosting rounds, no categorical features declared.
# NOTE(review): presumably the helper trains on the first pair and validates
# on the second — confirm against tpf.mlib.lightgbm.
model = lgbm_baseline(
    X_train, y_train, X_test, y_test,
    cat_features=[],
    num_boost_round=3,
    params=params,
)
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=10
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=10
[LightGBM] [Info] Number of positive: 249, number of negative: 149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3978
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 30
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625628 -> initscore=0.513507
[LightGBM] [Info] Start training from score 0.513507
num_leaves:一棵树允许的最大叶子节点数(上限而非实际数量;同时设置 max_depth 时,实际叶子数不会超过 2^max_depth)
|
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# Load the scikit-learn breast-cancer dataset.
data = load_breast_cancer()
X = data.data  # features
y = data.target  # labels

# Split into train and test sets (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# LightGBM settings for this run.
# NOTE: `min_child_samples` and `min_data_in_leaf` name the same setting;
# LightGBM warns that `min_child_samples` is ignored when both are given.
params = {
    'bagging_fraction': 0.8,
    'feature_fraction': 0.9,
    'lambda_l1': 0.001,
    'lambda_l2': 0.001,
    'learning_rate': 0.01,
    'max_depth': 5,
    'metric': 'binary_logloss',
    'min_child_samples': 10,
    'min_data_in_leaf': 10,
    'min_gain_to_split': 1e-4,
    'n_estimators': 100,
    'num_leaves': 5,
    'num_threads': 4,
    'objective': 'binary',
}

# The import and the instantiation were fused onto one line in the original
# paste, which is a syntax error; they must be two separate statements.
from tpf.mlib import ModelTrain

mt = ModelTrain()

# Train through the project's ModelTrain facade using the 'lgbm' algorithm key.
model = mt.train(
    'lgbm', X_train, y_train, X_test, y_test,
    cat_features=None,
    num_boost_round=100,
    params=params,
)
|
下面是LightGBM算法的参数
```
# LightGBM parameters for the run whose metrics are quoted right after this
# excerpt.
# NOTE(review): `n_estimators` and `num_boost_round` are both documented by
# LightGBM as aliases of `num_iterations`, and `min_child_samples` is an alias
# of `min_data_in_leaf`; confirm which of the duplicated entries is intended.
params = {
'bagging_fraction': 0.8,
'feature_fraction': 0.9,
'lambda_l1': 0.001,
'lambda_l2': 0.001,
'learning_rate': 0.01,
'max_depth': 6,
'metric': 'binary_logloss',
'min_child_samples': 10,
'min_data_in_leaf': 10,
'min_gain_to_split': 1e-4,
'n_estimators': 100,
'num_leaves': 5,
'num_threads': 4,
'objective': 'binary',
'num_boost_round':100}
# Train a booster and evaluate it on `test_data` during training.
# NOTE(review): `train_data`, `test_data` and the `num_boost_round` variable
# are not defined in this excerpt; presumably they come from the surrounding
# training helper — confirm.
lgb_model = lgb.train(params, train_data,
num_boost_round=num_boost_round,
valid_sets=[test_data], )
```
训练的精度如下
'evaluation': {'acc': np.float64(0.997), 'precision': 0.9999, 'recall': np.float64(0.1628), 'f1': np.float64(0.28), 'auc': np.float64(0.9672)}
precision过高,而recall偏低,应该调整哪些参数来增强模型的泛化能力,以牺牲部分精确率(precision)为代价提高召回率(recall)?
针对您的LightGBM模型precision过高而recall偏低的问题,建议调整以下参数:
## 关键参数调整:
### 1. 处理类别不平衡(最重要)
```python
'is_unbalance': True, # 自动处理类别不平衡
# 或者明确指定权重
'scale_pos_weight': 10 # 尝试5-20之间的值,根据正负样本比例调整
```
### 2. 调整模型复杂度
```python
'max_depth': 4, # 降低树深,防止过拟合(从6降到4)
'num_leaves': 31, # 适当增加叶子数(从5增加到15-31)
'min_child_samples': 20, # 增加最小子样本数(从10到20)
'min_data_in_leaf': 20, # 增加叶节点最小数据量(从10到20)
```
### 3. 增强正则化
```python
'lambda_l1': 0.1, # 增加L1正则化(从0.001到0.1)
'lambda_l2': 0.1, # 增加L2正则化(从0.001到0.1)
'bagging_freq': 5, # 添加bagging频率
```
### 4. 学习策略调整
```python
'learning_rate': 0.05, # 提高学习率(从0.01到0.05)
'n_estimators': 200, # 增加迭代次数
```
## 修改后的参数建议:
```python
# Suggested LightGBM settings aimed at trading some precision for recall.
params = dict(
    bagging_fraction=0.8,      # only takes effect because bagging_freq is non-zero
    feature_fraction=0.9,
    lambda_l1=0.1,             # stronger L1 regularisation
    lambda_l2=0.1,             # stronger L2 regularisation
    learning_rate=0.05,        # higher learning rate
    max_depth=4,               # shallower trees, lower complexity
    metric='binary_logloss',
    min_child_samples=20,      # guard against overfitting
    min_data_in_leaf=20,       # same setting as min_child_samples (alias)
    min_gain_to_split=1e-4,
    n_estimators=200,          # more boosting iterations
    # NOTE: a tree of depth 4 has at most 2**4 = 16 leaves, so the full
    # budget of 31 leaves cannot be reached while max_depth stays at 4.
    num_leaves=31,
    num_threads=4,
    objective='binary',
    is_unbalance=True,         # re-weight the classes automatically
    bagging_freq=5,            # re-sample the bag every 5 iterations
    boosting_type='gbdt',
)
```
## 训练代码调整:
```python
# Train with early stopping and periodic metric logging.
# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments were
# removed from `lgb.train` in LightGBM 4.0; the equivalent callbacks below
# are accepted by both the 3.3 and the 4.x series.
lgb_model = lgb.train(
    params,
    train_data,
    num_boost_round=200,  # matches n_estimators
    valid_sets=[test_data],
    callbacks=[
        # Stop once the validation metric has not improved for 50 rounds.
        lgb.early_stopping(stopping_rounds=50),
        # Print the evaluation results every 50 rounds.
        lgb.log_evaluation(period=50),
    ],
)
```
## 额外建议:
1. **调整预测阈值**:训练后尝试不同的分类阈值(如从0.5降到0.3-0.4)
2. **使用F1-score作为早停指标**:LightGBM没有内置的F1指标,需要通过自定义评估函数(`feval`)计算F1,再配合早停(`early_stopping`)监控该指标
3. **添加交叉验证**:确保参数调整的稳定性
这些调整会降低precision但显著提高recall,改善F1-score,并增强模型泛化能力。
# Recorded outcome of an 'lgbm' run: saved artifact paths plus the evaluation
# metrics that were reported for it.
res.append(
    dict(
        alg_type='lgbm',
        model_path='/ai/data/model/lgbm_1417.pkl',
        scale_path='/ai/data/model/lgbm_minmaxscaler_1417.pkl',
        evaluation=dict(
            acc=np.float64(0.9735),
            precision=np.float64(0.0857),
            recall=np.float64(0.6628),
            f1=np.float64(0.1518),
            auc=np.float64(0.9127),
        ),
    )
)
|
|
|
|
|
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# Load the scikit-learn breast-cancer dataset: X holds the features,
# y holds the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 30% of the samples as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

from tpf.mlib import ModelTrain

mt = ModelTrain()

# Settings handed to the 'lgbmc' trainer.
# NOTE: `min_child_samples` and `min_data_in_leaf` name the same LightGBM
# setting, so only one of the two values can take effect.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    class_weight=None,
    learning_rate=0.01,
    max_depth=-1,
    lambda_l1=0.01,
    lambda_l2=0.01,
    min_child_samples=10,
    min_data_in_leaf=30,
    bagging_fraction=0.8,
    feature_fraction=0.9,
    early_stopping_rounds=20,
    n_estimators=300,
    num_leaves=30,
    verbose=-1,
)

# Train through the project's ModelTrain facade using the 'lgbmc' key.
model = mt.train(
    'lgbmc', X_train, y_train, X_test, y_test,
    cat_features=None,
    params=params,
)
Training until validation scores don't improve for 20 rounds
[20] train's binary_logloss: 0.530186 valid's binary_logloss: 0.529607
[40] train's binary_logloss: 0.437406 valid's binary_logloss: 0.439458
[60] train's binary_logloss: 0.366415 valid's binary_logloss: 0.371285
[80] train's binary_logloss: 0.311904 valid's binary_logloss: 0.319237
[100] train's binary_logloss: 0.267372 valid's binary_logloss: 0.277919
[120] train's binary_logloss: 0.231631 valid's binary_logloss: 0.244746
[140] train's binary_logloss: 0.202447 valid's binary_logloss: 0.218923
[160] train's binary_logloss: 0.178266 valid's binary_logloss: 0.197623
[180] train's binary_logloss: 0.157208 valid's binary_logloss: 0.179697
[200] train's binary_logloss: 0.138633 valid's binary_logloss: 0.164147
[220] train's binary_logloss: 0.122707 valid's binary_logloss: 0.151118
[240] train's binary_logloss: 0.108282 valid's binary_logloss: 0.140934
[260] train's binary_logloss: 0.0962268 valid's binary_logloss: 0.132772
[280] train's binary_logloss: 0.085165 valid's binary_logloss: 0.125964
[300] train's binary_logloss: 0.075422 valid's binary_logloss: 0.119737
Did not meet early stopping. Best iteration is:
[300] train's binary_logloss: 0.075422 valid's binary_logloss: 0.119737
|
'is_unbalance': False, # 移除这个,与class_weight冲突
'class_weight': 'balanced', # 使用自动平衡权重
# 或者
'scale_pos_weight': 100 # 根据正负样本比例手动设置
{'alg_type': 'lgbmc', 'model_path': '/ai/data/model/lgbmc_1417.pkl',
'scale_path': '/ai/data/model/lgbmc_minmaxscaler_1417.pkl',
'evaluation': {
'acc': np.float64(0.9964), 'precision': np.int64(0),
'recall': np.float64(0.0), 'f1': np.float64(0.0), 'auc': np.float64(0.9191)}}
'is_unbalance': True 移除后
{'alg_type': 'lgbmc', 'model_path': '/ai/data/model/lgbmc_1417.pkl', 'scale_path': '/ai/data/model/lgbmc_minmaxscaler_1417.pkl',
'evaluation': {'acc': np.float64(0.9975), 'precision': np.float64(0.963),
'recall': np.float64(0.3023), 'f1': np.float64(0.4602),
'auc': np.float64(0.9786)}}
看到您的评估结果(precision: 0, recall: 0.0, f1: 0.0),这表明模型完全无法识别正类样本。需要从根本上调整参数:
## 关键问题诊断:
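1. `is_unbalance` 与 `scale_pos_weight` 不能同时使用(LightGBM 文档要求二选一);`"class_weight": None` 本身不会与 `'is_unbalance': True` 冲突,但若改用 `class_weight='balanced'`,就不应再同时开启 `is_unbalance`,否则可能对正类重复加权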
2. 参数设置过于保守,导致模型无法学习到正类特征
## 急需调整的参数:
### 1. 移除冲突参数,明确类别权重
```python
'is_unbalance': False, # 移除这个,与class_weight冲突
'class_weight': 'balanced', # 使用自动平衡权重
# 或者
'scale_pos_weight': 100 # 根据正负样本比例手动设置
```
### 2. 降低模型复杂度,让模型更容易学习
```python
"max_depth": 5, # 从-1改为具体数值(3-8)
"num_leaves": 15, # 减少叶子数(从30降到15-20)
"min_data_in_leaf": 10, # 减少叶节点最小数据量(从30降到10-20)
"min_child_samples": 5, # 减少最小子样本数(从10降到5-10)
```
### 3. 调整正则化强度
```python
"lambda_l1": 0.001, # 降低L1正则化(从0.01到0.001)
"lambda_l2": 0.001, # 降低L2正则化(从0.01到0.001)
```
### 4. 优化学习策略
```python
"learning_rate": 0.05, # 提高学习率(从0.01到0.05)
"n_estimators": 500, # 增加迭代次数
```
## 修改后的参数建议:
```python
# Suggested LGBMClassifier settings for the run that found no positives.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    class_weight='balanced',    # balance the classes through sample weights
    learning_rate=0.05,
    max_depth=5,
    lambda_l1=0.001,
    lambda_l2=0.001,
    min_child_samples=5,
    min_data_in_leaf=10,        # same setting as min_child_samples (alias)
    bagging_fraction=0.8,
    feature_fraction=0.9,
    early_stopping_rounds=50,   # more patience before stopping early
    n_estimators=500,
    num_leaves=15,
    is_unbalance=False,         # off, because class_weight does the re-weighting
    verbose=-1,
)
```
## 额外重要建议:
### 1. 检查数据预处理
```python
# 确保标签正确编码
y = y.astype(int)
# 检查正负样本比例
print(f"正样本比例: {sum(y == 1) / len(y):.4f}")
```
### 2. 添加监控指标
```python
# 在训练时添加评估指标
eval_metric = ['binary_error', 'auc', 'binary_logloss']
```
### 3. 尝试不同的boosting类型
```python
# 如果效果仍不好,尝试dart或goss
"boosting_type": 'dart', # 或者 'goss'
"drop_rate": 0.1, # 如果使用dart
```
### 4. 确保正确使用LGBMClassifier
```python
from lightgbm import LGBMClassifier, log_evaluation

# Fit through LightGBM's scikit-learn wrapper, evaluating on the held-out
# split during training.
model = LGBMClassifier(**params)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    # `fit()` no longer accepts `verbose=` since LightGBM 4.0; a log period
    # of 0 keeps the per-iteration evaluation output switched off instead.
    callbacks=[log_evaluation(period=0)],
)
```
这些调整应该能帮助模型开始识别正类样本,显著提高recall和precision。
{'alg_type': 'lgbmc', 'model_path': '/ai/data/model/lgbmc_1417.pkl',
'scale_path': '/ai/data/model/lgbmc_minmaxscaler_1417.pkl',
'evaluation': {
'acc': np.float64(0.9873), 'precision': np.float64(0.1506),
'recall': np.float64(0.5465), 'f1': np.float64(0.2362),
'auc': np.float64(0.9584)}}
|
|
|
|
|
|
|
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# Load the scikit-learn breast-cancer dataset: X holds the features,
# y holds the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 30% of the samples as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Notebook-style display of the four array shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

from tpf.mlib import ModelTrain

mt = ModelTrain()

# Settings handed to the 'xgbc' trainer.
# NOTE(review): `use_best_model` is a CatBoost option name; whether the
# 'xgbc' trainer consumes it is not visible here — confirm.
params = dict(
    booster='gbtree',
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.03,
    n_estimators=1000,
    min_child_weight=1,
    gamma=0,
    subsample=1.0,
    colsample_bytree=1.0,
    reg_alpha=0,
    reg_lambda=1,
    verbosity=1,
    eval_metric='logloss',
    use_best_model=True,
    early_stopping_rounds=20,
)

# Train through the project's ModelTrain facade using the 'xgbc' key.
model = mt.train(
    'xgbc', X_train, y_train, X_test, y_test,
    params=params,
)
[0] validation_0-logloss:0.63410 [1] validation_0-logloss:0.61127 [5] validation_0-logloss:0.53326 [6] validation_0-logloss:0.51671 [7] validation_0-logloss:0.50111 [8] validation_0-logloss:0.48594 [9] validation_0-logloss:0.47171 [18] validation_0-logloss:0.36734 [19] validation_0-logloss:0.35824 [20] validation_0-logloss:0.34948 [30] validation_0-logloss:0.27944 [31] validation_0-logloss:0.27358 [32] validation_0-logloss:0.26813 [41] validation_0-logloss:0.22677 [42] validation_0-logloss:0.22276 [43] validation_0-logloss:0.21895 [52] validation_0-logloss:0.19010 [53] validation_0-logloss:0.18755 [54] validation_0-logloss:0.18535 [64] validation_0-logloss:0.16247 [65] validation_0-logloss:0.16058 [131] validation_0-logloss:0.10422 [132] validation_0-logloss:0.10409 [133] validation_0-logloss:0.10380 [134] validation_0-logloss:0.10369 [140] validation_0-logloss:0.10235 [141] validation_0-logloss:0.10197 [263] validation_0-logloss:0.09296 [264] validation_0-logloss:0.09304 [265] validation_0-logloss:0.09320 [316] validation_0-logloss:0.09144 [317] validation_0-logloss:0.09149 [324] validation_0-logloss:0.09169 [325] validation_0-logloss:0.09171 |
# Update your XGBoost parameters to enable categorical features.
# `use_best_model` has been dropped from this dict: it is a CatBoost option,
# not an XGBoost one, so XGBClassifier would only report it as unused.
params = {
    "booster": 'gbtree',
    "objective": 'binary:logistic',
    "max_depth": 6,
    "learning_rate": 0.03,
    "n_estimators": 1000,
    "min_child_weight": 1,
    "gamma": 0,
    "subsample": 1.0,
    "colsample_bytree": 1.0,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "verbosity": 1,
    "eval_metric": 'logloss',
    "early_stopping_rounds": 20,
    # Lets XGBoost consume pandas "category" columns directly.
    # NOTE(review): older XGBoost releases also need a hist-based
    # `tree_method` for this — confirm against the installed version.
    "enable_categorical": True,
}
# Build the scikit-learn style XGBoost classifier from the parameter dict.
# NOTE(review): `xgb` is not imported anywhere in the visible notes —
# presumably `import xgboost as xgb` ran in an earlier cell; confirm.
model = xgb.XGBClassifier(**params)
类别列必须转换为 pandas 的 category 类型;只转成字符串(str/object)类型不行,enable_categorical 只识别 category 类型
# Cast the classification-type columns to str and then to pandas "category"
# dtype in a single chained assignment.
# NOTE(review): the intermediate str cast presumably normalises mixed or
# numeric codes before categorising — confirm.
df[pc.col_type.classify_type] = (
    df[pc.col_type.classify_type]
    .astype(str)
    .astype("category")
)
|
|
|
|
|
|
|
# The setup below was collapsed onto a single line in the original paste,
# which is a syntax error; it is restored to one statement per line.
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# Load the scikit-learn breast-cancer dataset.
data = load_breast_cancer()
X = data.data  # features
y = data.target  # labels

# Split into train and test sets (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Notebook-style display of the four array shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

from tpf.mlib import ModelTrain

mt = ModelTrain()

# Settings handed to the 'catboost' trainer.
params = {
    "iterations": 1000,  # synonym of num_trees, num_boost_round, n_estimators
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 3.0,
    "random_strength": 1.0,
    "bagging_temperature": 1.0,
    "grow_policy": 'SymmetricTree',
    "eval_metric": 'Logloss',
    "verbose": 100,
    "early_stopping_rounds": 50,
    "task_type": 'CPU',
    "subsample": 1.0,
    "rsm": 1.0,
    "border_count": 254,
    "loss_function": 'Logloss',
    "use_best_model": True,
    "od_type": 'Iter',
}

# Train through the project's ModelTrain facade using the 'catboost' key.
model = mt.train(
    'catboost', X_train, y_train, X_test, y_test,
    cat_features=None,
    params=params,
)
0: learn: 0.6484000 test: 0.6481265 best: 0.6481265 (0) total: 58.9ms remaining: 58.8s
100: learn: 0.0369051 test: 0.0833139 best: 0.0833139 (100) total: 173ms remaining: 1.54s
200: learn: 0.0132401 test: 0.0717253 best: 0.0717253 (200) total: 276ms remaining: 1.09s
300: learn: 0.0062749 test: 0.0704805 best: 0.0681547 (266) total: 359ms remaining: 835ms
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.06815471898
bestIteration = 266
Shrink model to first 267 iterations.
|
|
|
|
|
|
|
|
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
# Load the scikit-learn breast-cancer dataset: X holds the features,
# y holds the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 30% of the samples as the test split; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
|
from tpf.mlib import ModelTrain

# Run 1: 'lgbm' with no explicit parameters (params=None).
mt = ModelTrain()
model = mt.train(
    'lgbm', X_train, y_train, X_test, y_test,
    cat_features=None,
    params=None,
)

from tpf.mlib import ModelTrain

# Run 2: 'lgbm' with an explicit parameter set on a fresh ModelTrain instance.
mt = ModelTrain()
# NOTE(review): `n_estimators` and `num_boost_round` are both documented by
# LightGBM as aliases of `num_iterations`, and `min_child_samples` is an alias
# of `min_data_in_leaf`; confirm which of the duplicated entries is intended.
params = dict(
    bagging_fraction=0.8,
    feature_fraction=0.9,
    lambda_l1=0.001,
    lambda_l2=0.001,
    learning_rate=0.01,
    max_depth=5,
    metric='binary_logloss',
    min_child_samples=10,
    min_data_in_leaf=10,
    min_gain_to_split=1e-4,
    n_estimators=100,
    num_leaves=5,
    num_threads=4,
    objective='binary',
    num_boost_round=100,
)
model = mt.train(
    'lgbm', X_train, y_train, X_test, y_test,
    cat_features=None,
    params=params,
)
|
|
|
|
|
|
|