七三笔记

LightGBM LGBMClassifier xgb catboost 通用封装·实时参考

LightGBM

 
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data  # 特征
y = data.target  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


params = {
    'bagging_fraction': 0.8,
        'feature_fraction': 0.9,
        'lambda_l1': 0.001,
        'lambda_l2': 0.001,
        'learning_rate': 0.01,
        'max_depth': 5,
        'metric': 'binary_logloss',
        'min_child_samples': 10,
        'min_data_in_leaf': 10,
        'min_gain_to_split': 1e-4,
        'n_estimators': 100,
        'num_leaves': 64,
        'num_threads': 4,
        'objective': 'binary'}



from tpf.mlib.lightgbm import lgbm_baseline
model = lgbm_baseline(X_train,
    y_train,
    X_test,
    y_test,
    cat_features=[],
    num_boost_round=100,
    params=params)

 
[LightGBM] [Warning] min_data_in_leaf is set=20, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=20
[LightGBM] [Warning] min_data_in_leaf is set=20, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=20
[LightGBM] [Info] Number of positive: 249, number of negative: 149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3978
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 30
[LightGBM] [Warning] min_data_in_leaf is set=20, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625628 -> initscore=0.513507
[LightGBM] [Info] Start training from score 0.513507
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best

 

params = {
    'bagging_fraction': 0.8,
        'feature_fraction': 0.9,
        'lambda_l1': 0.001,
        'lambda_l2': 0.001,
        'learning_rate': 0.01,
        'max_depth': 5,
        'metric': 'binary_logloss',
        'min_child_samples': 10,
        'min_data_in_leaf': 10,
        'min_gain_to_split': 1e-4,
        'n_estimators': 100,
        'num_leaves': 5,
        'num_threads': 4,
        'objective': 'binary'}



from tpf.mlib.lightgbm import lgbm_baseline
model = lgbm_baseline(X_train,
    y_train,
    X_test,
    y_test,
    cat_features=[],
    num_boost_round=3,
    params=params)

 
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=10
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=10
[LightGBM] [Info] Number of positive: 249, number of negative: 149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3978
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 30
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=10 will be ignored. Current value: min_data_in_leaf=10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625628 -> initscore=0.513507
[LightGBM] [Info] Start training from score 0.513507

 
num_leaves：一棵树中叶子节点的最大个数（同时受 max_depth 约束：当 max_depth > 0 时，叶子数最多为 2^max_depth）

 
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data  # 特征
y = data.target  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


params = {
    'bagging_fraction': 0.8,
        'feature_fraction': 0.9,
        'lambda_l1': 0.001,
        'lambda_l2': 0.001,
        'learning_rate': 0.01,
        'max_depth': 5,
        'metric': 'binary_logloss',
        'min_child_samples': 10,
        'min_data_in_leaf': 10,
        'min_gain_to_split': 1e-4,
        'n_estimators': 100,
        'num_leaves': 5,
        'num_threads': 4,
        'objective': 'binary'}

 
from tpf.mlib import ModelTrain
mt = ModelTrain()

 
model = mt.train('lgbm',X_train,y_train,X_test,y_test,
cat_features=None,
num_boost_round=100,
params=params)

 
下面是LightGBM算法的参数
```
params = {
                'bagging_fraction': 0.8,
                    'feature_fraction': 0.9,
                    'lambda_l1': 0.001,
                    'lambda_l2': 0.001,
                    'learning_rate': 0.01,
                    'max_depth': 6,
                    'metric': 'binary_logloss',
                    'min_child_samples': 10,
                    'min_data_in_leaf': 10,
                    'min_gain_to_split': 1e-4,
                    'n_estimators': 100,
                    'num_leaves': 5,
                    'num_threads': 4,
                    'objective': 'binary',
                    'num_boost_round':100}
                    
lgb_model = lgb.train(params, train_data, 
                          num_boost_round=num_boost_round, 
                          valid_sets=[test_data], )

```
训练的精度如下 
'evaluation': {'acc': np.float64(0.997), 'precision': 0.9999, 'recall': np.float64(0.1628), 'f1': np.float64(0.28), 'auc': np.float64(0.9672)}

precision过高，而recall偏低，应该调整哪些参数，增加模型的泛化能力，以牺牲精度为代价提高一些召回率？

 
针对您的LightGBM模型precision过高而recall偏低的问题，建议调整以下参数：

## 关键参数调整：

### 1. 处理类别不平衡（最重要）
```python
'is_unbalance': True,  # 自动处理类别不平衡
# 或者明确指定权重
'scale_pos_weight': 10  # 尝试5-20之间的值，根据正负样本比例调整
```

### 2. 调整模型复杂度
```python
'max_depth': 4,        # 降低树深，防止过拟合（从6降到4）
'num_leaves': 31,      # 适当增加叶子数（从5增加到15-31）
'min_child_samples': 20,  # 增加最小子样本数（从10到20）
'min_data_in_leaf': 20,   # 增加叶节点最小数据量（从10到20）
```

### 3. 增强正则化
```python
'lambda_l1': 0.1,      # 增加L1正则化（从0.001到0.1）
'lambda_l2': 0.1,      # 增加L2正则化（从0.001到0.1）
'bagging_freq': 5,     # 添加bagging频率
```

### 4. 学习策略调整
```python
'learning_rate': 0.05, # 提高学习率（从0.01到0.05）
'n_estimators': 200,   # 增加迭代次数
```

## 修改后的参数建议：

```python
params = {
    'bagging_fraction': 0.8,
    'feature_fraction': 0.9,
    'lambda_l1': 0.1,          # 增强正则化
    'lambda_l2': 0.1,          # 增强正则化
    'learning_rate': 0.05,     # 提高学习率
    'max_depth': 4,            # 降低复杂度
    'metric': 'binary_logloss',
    'min_child_samples': 20,   # 防止过拟合
    'min_data_in_leaf': 20,    # 防止过拟合
    'min_gain_to_split': 1e-4,
    'n_estimators': 200,       # 增加迭代
    'num_leaves': 31,          # 适当增加叶子
    'num_threads': 4,
    'objective': 'binary',
    'is_unbalance': True,      # 处理类别不平衡
    'bagging_freq': 5,         # 每5次迭代执行bagging
    'boosting_type': 'gbdt'
}
```

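## 训练代码调整：

> 注意：下面的 `early_stopping_rounds=` 与 `verbose_eval=` 写法仅适用于 LightGBM 4.0 之前的版本；自 4.0 起 `lgb.train` 已移除这两个参数，需改用 `callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]`。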

```python
lgb_model = lgb.train(params, train_data, 
                      num_boost_round=200,  # 对应n_estimators
                      valid_sets=[test_data],
                      early_stopping_rounds=50,  # 添加早停
                      verbose_eval=50)
```

## 额外建议：

1. **调整预测阈值**：训练后尝试不同的分类阈值（如从0.5降到0.3-0.4）
2. **使用F1-score作为早停指标**：LightGBM 没有内置的 F1 指标，需要通过自定义评估函数（`lgb.train` 的 `feval`，或 `LGBMClassifier.fit` 的 `eval_metric` 传入可调用对象）计算 F1，再配合早停进行监控
3. **添加交叉验证**：确保参数调整的稳定性

这些调整会降低precision但显著提高recall，改善F1-score，并增强模型泛化能力。

 
res.append({
  'alg_type': 'lgbm', 
  'model_path': '/ai/data/model/lgbm_1417.pkl', 
  'scale_path': '/ai/data/model/lgbm_minmaxscaler_1417.pkl', 
  'evaluation': {
      'acc': np.float64(0.9735), 
      'precision': np.float64(0.0857), 
      'recall': np.float64(0.6628), 
      'f1': np.float64(0.1518), 
      'auc': np.float64(0.9127)}}
)

LGBMClassifier

 
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data  # 特征
y = data.target  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from tpf.mlib import ModelTrain
mt = ModelTrain()

 
params={
    "boosting_type": 'gbdt',
    "objective": 'binary',
    "class_weight": None,
    "learning_rate":0.01,
    "max_depth": -1,
    "lambda_l1": 0.01,
    "lambda_l2": 0.01,
    "min_child_samples": 10,
    "min_data_in_leaf": 30,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.9,
    "early_stopping_rounds":20,
    "n_estimators":300,
    "num_leaves":30,
    "verbose": -1
}

model = mt.train('lgbmc',X_train,y_train,X_test,y_test,
    cat_features=None,
    params=params)

 
Training until validation scores don't improve for 20 rounds
[20]	train's binary_logloss: 0.530186	valid's binary_logloss: 0.529607
[40]	train's binary_logloss: 0.437406	valid's binary_logloss: 0.439458
[60]	train's binary_logloss: 0.366415	valid's binary_logloss: 0.371285
[80]	train's binary_logloss: 0.311904	valid's binary_logloss: 0.319237
[100]	train's binary_logloss: 0.267372	valid's binary_logloss: 0.277919
[120]	train's binary_logloss: 0.231631	valid's binary_logloss: 0.244746
[140]	train's binary_logloss: 0.202447	valid's binary_logloss: 0.218923
[160]	train's binary_logloss: 0.178266	valid's binary_logloss: 0.197623
[180]	train's binary_logloss: 0.157208	valid's binary_logloss: 0.179697
[200]	train's binary_logloss: 0.138633	valid's binary_logloss: 0.164147
[220]	train's binary_logloss: 0.122707	valid's binary_logloss: 0.151118
[240]	train's binary_logloss: 0.108282	valid's binary_logloss: 0.140934
[260]	train's binary_logloss: 0.0962268	valid's binary_logloss: 0.132772
[280]	train's binary_logloss: 0.085165	valid's binary_logloss: 0.125964
[300]	train's binary_logloss: 0.075422	valid's binary_logloss: 0.119737
Did not meet early stopping. Best iteration is:
[300]	train's binary_logloss: 0.075422	valid's binary_logloss: 0.119737

 
'is_unbalance': False,  # 移除这个，与class_weight冲突
'class_weight': 'balanced',  # 使用自动平衡权重
# 或者
'scale_pos_weight': 100  # 根据正负样本比例手动设置

 
{'alg_type': 'lgbmc', 'model_path': '/ai/data/model/lgbmc_1417.pkl', 
'scale_path': '/ai/data/model/lgbmc_minmaxscaler_1417.pkl', 
'evaluation': {
  'acc': np.float64(0.9964), 'precision': np.int64(0), 
  'recall': np.float64(0.0), 'f1': np.float64(0.0), 'auc': np.float64(0.9191)}}


'is_unbalance': True 移除后 

{'alg_type': 'lgbmc', 'model_path': '/ai/data/model/lgbmc_1417.pkl', 'scale_path': '/ai/data/model/lgbmc_minmaxscaler_1417.pkl', 
'evaluation': {'acc': np.float64(0.9975), 'precision': np.float64(0.963), 
'recall': np.float64(0.3023), 'f1': np.float64(0.4602), 
'auc': np.float64(0.9786)}}

 
看到您的评估结果（precision: 0, recall: 0.0, f1: 0.0），这表明模型完全无法识别正类样本。需要从根本上调整参数：

## 关键问题诊断：
1. `'is_unbalance': True` 和 `"class_weight": None` 同时存在可能冲突
2. 参数设置过于保守，导致模型无法学习到正类特征

## 急需调整的参数：

### 1. 移除冲突参数，明确类别权重
```python
'is_unbalance': False,  # 移除这个，与class_weight冲突
'class_weight': 'balanced',  # 使用自动平衡权重
# 或者
'scale_pos_weight': 100  # 根据正负样本比例手动设置
```

### 2. 降低模型复杂度，让模型更容易学习
```python
"max_depth": 5,          # 从-1改为具体数值（3-8）
"num_leaves": 15,        # 减少叶子数（从30降到15-20）
"min_data_in_leaf": 10,  # 减少叶节点最小数据量（从30降到10-20）
"min_child_samples": 5,  # 减少最小子样本数（从10降到5-10）
```

### 3. 调整正则化强度
```python
"lambda_l1": 0.001,      # 降低L1正则化（从0.01到0.001）
"lambda_l2": 0.001,      # 降低L2正则化（从0.01到0.001）
```

### 4. 优化学习策略
```python
"learning_rate": 0.05,   # 提高学习率（从0.01到0.05）
"n_estimators": 500,     # 增加迭代次数
```

## 修改后的参数建议：

```python
params={
    "boosting_type": 'gbdt',
    "objective": 'binary',
    "class_weight": 'balanced',  # 使用平衡权重
    "learning_rate": 0.05,
    "max_depth": 5,
    "lambda_l1": 0.001,
    "lambda_l2": 0.001,
    "min_child_samples": 5,
    "min_data_in_leaf": 10,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.9,
    "early_stopping_rounds": 50,  # 增加早停轮数
    "n_estimators": 500,
    "num_leaves": 15,
    "is_unbalance": False,  # 移除冲突参数
    "verbose": -1
}
```

## 额外重要建议：

### 1. 检查数据预处理
```python
# 确保标签正确编码
y = y.astype(int)
# 检查正负样本比例
print(f"正样本比例: {sum(y == 1) / len(y):.4f}")
```

### 2. 添加监控指标
```python
# 在训练时添加评估指标
eval_metric = ['binary_error', 'auc', 'binary_logloss']
```

### 3. 尝试不同的boosting类型
```python
# 如果效果仍不好，尝试dart或goss
"boosting_type": 'dart',  # 或者 'goss'
"drop_rate": 0.1,         # 如果使用dart
```

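### 4. 确保正确使用LGBMClassifier

> 注意：LightGBM 4.0 起 `fit()` 已移除 `verbose` 参数；如需关闭训练日志，请在参数中设置 `"verbose": -1`，或传入 `callbacks=[lgb.log_evaluation(0)]`。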
```python
from lightgbm import LGBMClassifier

model = LGBMClassifier(**params)
model.fit(X_train, y_train, 
          eval_set=[(X_test, y_test)],
          verbose=False)
```

这些调整应该能帮助模型开始识别正类样本，显著提高recall和precision。

 
{'alg_type': 'lgbmc', 'model_path': '/ai/data/model/lgbmc_1417.pkl', 
'scale_path': '/ai/data/model/lgbmc_minmaxscaler_1417.pkl', 
'evaluation': {
  'acc': np.float64(0.9873), 'precision': np.float64(0.1506), 
  'recall': np.float64(0.5465), 'f1': np.float64(0.2362), 
  'auc': np.float64(0.9584)}}

xgb

 
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data  # 特征
y = data.target  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

 
from tpf.mlib import ModelTrain
mt = ModelTrain()

params =  {
        "booster": 'gbtree',
        "objective": 'binary:logistic',
        "max_depth": 6,
        "learning_rate": 0.03,
        "n_estimators": 1000,
        "min_child_weight": 1,
        "gamma": 0,
        "subsample": 1.0,
        "colsample_bytree": 1.0,
        "reg_alpha": 0,
        "reg_lambda": 1,
        "verbosity": 1,
        "eval_metric": 'logloss',
        "use_best_model": True,
        "early_stopping_rounds": 20
    }

model = mt.train('xgbc',X_train,y_train,X_test,y_test,
    params=params)

 
[0]	validation_0-logloss:0.63410
[1]	validation_0-logloss:0.61127

[5]	validation_0-logloss:0.53326
[6]	validation_0-logloss:0.51671
[7]	validation_0-logloss:0.50111
[8]	validation_0-logloss:0.48594
[9]	validation_0-logloss:0.47171

[18]	validation_0-logloss:0.36734
[19]	validation_0-logloss:0.35824
[20]	validation_0-logloss:0.34948

[30]	validation_0-logloss:0.27944
[31]	validation_0-logloss:0.27358
[32]	validation_0-logloss:0.26813

[41]	validation_0-logloss:0.22677
[42]	validation_0-logloss:0.22276
[43]	validation_0-logloss:0.21895

[52]	validation_0-logloss:0.19010
[53]	validation_0-logloss:0.18755
[54]	validation_0-logloss:0.18535

[64]	validation_0-logloss:0.16247
[65]	validation_0-logloss:0.16058

[131]	validation_0-logloss:0.10422
[132]	validation_0-logloss:0.10409
[133]	validation_0-logloss:0.10380
[134]	validation_0-logloss:0.10369

[140]	validation_0-logloss:0.10235
[141]	validation_0-logloss:0.10197

[263]	validation_0-logloss:0.09296
[264]	validation_0-logloss:0.09304
[265]	validation_0-logloss:0.09320

[316]	validation_0-logloss:0.09144
[317]	validation_0-logloss:0.09149

[324]	validation_0-logloss:0.09169
[325]	validation_0-logloss:0.09171

 
# Update your XGBoost parameters to enable categorical features
params = {
    "booster": 'gbtree',
    "objective": 'binary:logistic',
    "max_depth": 6,
    "learning_rate": 0.03,
    "n_estimators": 1000,
    "min_child_weight": 1,
    "gamma": 0,
    "subsample": 1.0,
    "colsample_bytree": 1.0,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "verbosity": 1,
    "eval_metric": 'logloss',
    "use_best_model": True,
    "early_stopping_rounds": 20,
    "enable_categorical": True  # Add this parameter
}

model = xgb.XGBClassifier(**params)

启用 enable_categorical 后，类别特征列必须转换为 pandas 的 category 类型；仅转成 string（object）类型不行，XGBoost 只识别 category 类型的列

 
df[pc.col_type.classify_type] = df[pc.col_type.classify_type].astype(str)
df[pc.col_type.classify_type] = df[pc.col_type.classify_type].astype("category")

catboost

封装示例

 
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data  # 特征
y = data.target  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

 
from tpf.mlib import ModelTrain
mt = ModelTrain()

params = {
        "iterations": 1000,   #与num_trees,num_boost_round,n_estimators同义
        "learning_rate": 0.03,
        "depth": 6,
        "l2_leaf_reg": 3.0,
        "random_strength": 1.0,
        "bagging_temperature": 1.0,
        "grow_policy": 'SymmetricTree',
        "eval_metric": 'Logloss',
        "verbose": 100,
        "early_stopping_rounds": 50,
        "task_type": 'CPU',
        "subsample": 1.0,
        "rsm": 1.0,
        "border_count": 254,
        "loss_function": 'Logloss',
        "use_best_model": True,
        "od_type": 'Iter'
    }

model = mt.train('catboost',X_train,y_train,X_test,y_test,
    cat_features=None,
    params=params)

 
0:	learn: 0.6484000	test: 0.6481265	best: 0.6481265 (0)	total: 58.9ms	remaining: 58.8s
100:	learn: 0.0369051	test: 0.0833139	best: 0.0833139 (100)	total: 173ms	remaining: 1.54s
200:	learn: 0.0132401	test: 0.0717253	best: 0.0717253 (200)	total: 276ms	remaining: 1.09s
300:	learn: 0.0062749	test: 0.0704805	best: 0.0681547 (266)	total: 359ms	remaining: 835ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.06815471898
bestIteration = 266

Shrink model to first 267 iterations.

通用封装·实时

数据
lgbm

 
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data  # 特征
y = data.target  # 标签

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

 
 
from tpf.mlib import ModelTrain
mt = ModelTrain()

model = mt.train('lgbm',X_train,y_train,X_test,y_test,
cat_features=None,
params=None)


from tpf.mlib import ModelTrain
mt = ModelTrain()

params = {
  'bagging_fraction': 0.8,
      'feature_fraction': 0.9,
      'lambda_l1': 0.001,
      'lambda_l2': 0.001,
      'learning_rate': 0.01,
      'max_depth': 5,
      'metric': 'binary_logloss',
      'min_child_samples': 10,
      'min_data_in_leaf': 10,
      'min_gain_to_split': 1e-4,
      'n_estimators': 100,
      'num_leaves': 5,
      'num_threads': 4,
      'objective': 'binary',
      'num_boost_round':100}

model = mt.train('lgbm',X_train,y_train,X_test,y_test,
cat_features=None,
params=params)

参考

七三笔记路线：学习，记录，分享