CTR Prediction Modeling for Online Advertising

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc

import os

import lightgbm as lgb

Data Cleaning

data_path = './datasets/'
train = pd.read_csv(os.path.join(data_path,'train.csv'))
test = pd.read_csv(os.path.join(data_path,'test.csv'))

Field                 Description
id                    unique id of the user action (no duplicates)
date                  timestamp of the action, to the second
user_id               user id
product               product
campaign_id           campaign id
webpage_id            webpage id
product_category_id   product category id
user_group_id         id of the group the user belongs to
gender                gender
age_level             age bracket
user_depth            user value depth
var_1                 anonymized feature
isClick               label: 1 = clicked, 0 = not clicked

train
test
data = pd.concat([train,test],ignore_index=True)
data
data['day_id'] = data['date'].apply(lambda x: int(x[3:5]))  # day of month, sliced from the fixed-width timestamp string
data['minute_id'] = data['date'].apply(lambda x: int(x[-5:-3]) * 60 + int(x[-2:]))  # minute of day = hour * 60 + minute
data
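
The fixed-position string slicing above assumes a uniform timestamp format. A minimal alternative sketch, assuming data['date'] parses with pd.to_datetime:

# Hypothetical alternative: parse the timestamp instead of slicing fixed
# string positions (assumes pd.to_datetime recognizes the format).
dates = pd.to_datetime(data['date'])
data['day_id'] = dates.dt.day                             # day of month
data['minute_id'] = dates.dt.hour * 60 + dates.dt.minute  # minute of day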

Feature Engineering

Build the time gap between a user's two consecutive browsing actions within the same day, plus its derived per-day mean, because a user's browsing pace often correlates with whether they click.

First, plain shift(-1) moves the column up one row, pairing each row with the next row's minute_id; note that it ignores user/day boundaries:

data['minute_id'].shift(-1)
0            0.0
1            0.0
2            0.0
3            1.0
4            1.0
           ...  
463286    1439.0
463287    1439.0
463288    1439.0
463289    1439.0
463290       NaN
Name: minute_id, Length: 463291, dtype: float64

So compute the gap within each (user_id, day_id) group instead. With apply, the per-group results come back concatenated into one flat Series:

data.groupby(['user_id','day_id'])['minute_id'].apply(lambda x :x.shift(-1) -x)
0         NaN
1         0.0
2         2.0
3         NaN
4         NaN
         ... 
463286    NaN
463287    NaN
463288    0.0
463289    NaN
463290    NaN
Name: minute_id, Length: 463291, dtype: float64

agg instead returns one object per group (here a list of gaps, or NaN for single-row groups), which cannot be assigned back as a column:

data.groupby(['user_id','day_id'])['minute_id'].agg(lambda x :x.shift(-1) -x)
user_id  day_id
0        2                                              NaN
         6                                              NaN
1        2                                  [0.0, 2.0, nan]
2        2                                              NaN
         3         [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 6.0, nan]
                                     ...                   
150342   7                                       [0.0, nan]
150343   7                                              NaN
150344   7                                              NaN
150345   7                                              NaN
150346   7                                       [0.0, nan]
Name: minute_id, Length: 249625, dtype: object

transform returns a Series aligned with the original index, which is exactly the shape a new column needs:

data.groupby(['user_id','day_id'])['minute_id'].transform(lambda x :x.shift(-1) -x)
0         NaN
1         0.0
2         2.0
3         NaN
4         NaN
         ... 
463286    NaN
463287    NaN
463288    0.0
463289    NaN
463290    NaN
Name: minute_id, Length: 463291, dtype: float64
data['minute_id_diff'] = data.groupby(['user_id','day_id'])['minute_id'].transform(lambda x :x.shift(-1) -x)

The same distinction applies to the daily mean gap: transform('mean') broadcasts each group's mean back to every row, while agg('mean') and .mean() collapse to one value per group:

data.groupby(['user_id','day_id'])['minute_id_diff'].transform('mean')
0           NaN
1           1.0
2           1.0
3           NaN
4           NaN
          ...  
463286      NaN
463287     34.0
463288    154.8
463289    154.8
463290     26.0
Name: minute_id_diff, Length: 463291, dtype: float64
data.groupby(['user_id','day_id'])['minute_id_diff'].agg('mean')
user_id  day_id
0        2              NaN
         6              NaN
1        2         1.000000
2        2              NaN
         3         1.142857
                     ...   
150342   7         0.000000
150343   7              NaN
150344   7              NaN
150345   7              NaN
150346   7         0.000000
Name: minute_id_diff, Length: 249625, dtype: float64
data.groupby(['user_id','day_id'])['minute_id_diff'].mean()
user_id  day_id
0        2              NaN
         6              NaN
1        2         1.000000
2        2              NaN
         3         1.142857
                     ...   
150342   7         0.000000
150343   7              NaN
150344   7              NaN
150345   7              NaN
150346   7         0.000000
Name: minute_id_diff, Length: 249625, dtype: float64
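
To make the agg/transform distinction concrete, here is a tiny standalone demo on hypothetical data (not from this dataset):

toy = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1.0, 3.0, 5.0]})
print(toy.groupby('g')['v'].agg('mean'))        # one value per group: a -> 2.0, b -> 5.0
print(toy.groupby('g')['v'].transform('mean'))  # aligned with toy's index: [2.0, 2.0, 5.0]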
data['minute_id_diff_mean'] = data.groupby(['user_id','day_id'])['minute_id_diff'].transform('mean')
# Frequency (count) encoding: how often each id value appears in the full data
for col in ['user_id','product','campaign_id','webpage_id','product_category_id','user_group_id']:
    data['{}_count'.format(col)] = data.groupby(col)['minute_id'].transform('count')
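
The same counts can be built without a groupby; a possible equivalent sketch (map and value_counts are standard pandas, and the results match because minute_id has no missing values here):

for col in ['user_id', 'product', 'campaign_id', 'webpage_id', 'product_category_id', 'user_group_id']:
    data['{}_count'.format(col)] = data[col].map(data[col].value_counts())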
    
data.columns
Index(['id', 'date', 'user_id', 'product', 'campaign_id', 'webpage_id',
       'product_category_id', 'user_group_id', 'gender', 'age_level',
       'user_depth', 'var_1', 'isClick', 'day_id', 'minute_id',
       'minute_id_diff', 'minute_id_diff_mean', 'user_id_count',
       'product_count', 'campaign_id_count', 'webpage_id_count',
       'product_category_id_count', 'user_group_id_count'],
      dtype='object')
ycol = 'isClick'
drop_list = [
    ycol,
    'id',
    'date'
]

features = [x for x in data.columns if x not in drop_list]
print("Using {} features: {}".format(len(features), features))
Using 20 features: ['user_id', 'product', 'campaign_id', 'webpage_id', 'product_category_id', 'user_group_id', 'gender', 'age_level', 'user_depth', 'var_1', 'day_id', 'minute_id', 'minute_id_diff', 'minute_id_diff_mean', 'user_id_count', 'product_count', 'campaign_id_count', 'webpage_id_count', 'product_category_id_count', 'user_group_id_count']
# Cast the following features to the pandas category dtype, which LightGBM picks up as categorical
categorical_feature = [
    'user_id',
    'product',
    'campaign_id',
    'webpage_id',
    'product_category_id',
    'user_group_id',
    'gender',
    'age_level',
    'user_depth',
]

for col in categorical_feature:
    data[col] = data[col].astype('category')
# rows with a label are the training set; rows with missing isClick are the test set
train = data[~data[ycol].isnull()].copy()
test = data[data[ycol].isnull()].copy()  # .copy() so writing predictions later does not hit a view
train.shape
(391825, 23)
test.shape
(71466, 23)
del data
gc.collect()
0

Training the Model with 5-Fold Cross-Validation

NFOLD = 5
random_state = 2021
KF = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=random_state)

params_lgb = {
    'boosting': 'gbdt',
    'objective': 'binary',      # binary classification: click / no click
    'metric': 'auc',
    'force_row_wise': True,     # row-wise histogram building
    'random_state': random_state,
    'learning_rate': 0.03,
    'max_depth': 8,
    'num_leaves': 40,
    'subsample': 0.8,           # bagging fraction
    'subsample_freq': 3,        # perform bagging every 3 iterations
    'colsample_bytree': 0.8,    # feature fraction per tree
    'n_jobs': -1,
    'verbose': -1
}
oof_lgb = np.zeros(len(train))         # out-of-fold predictions on the training set
predictions_lgb = np.zeros(len(test))  # test predictions, averaged over the folds
df_importance_list = []
# 5-fold cross-validation
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train[features], train[ycol])):
    print('------------fold{}-----------'.format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=train.iloc[trn_idx][ycol])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=train.iloc[val_idx][ycol], reference=trn_data)

    clf_lgb = lgb.train(
        params=params_lgb,
        train_set=trn_data,
        valid_sets=[trn_data, val_data],
        valid_names=('train', 'val'),
        num_boost_round=50000,
        early_stopping_rounds=200,  # stop if val AUC does not improve for 200 rounds
        verbose_eval=100,
    )

    # out-of-fold prediction for this fold's validation split
    oof_lgb[val_idx] = clf_lgb.predict(train.iloc[val_idx][features], num_iteration=clf_lgb.best_iteration)
    # accumulate the fold-averaged test prediction
    predictions_lgb += clf_lgb.predict(test[features], num_iteration=clf_lgb.best_iteration) / NFOLD

    df_importance = pd.DataFrame({
        'column': features,
        'importance_split': clf_lgb.feature_importance(importance_type='split'),
        'importance_gain': clf_lgb.feature_importance(importance_type='gain')
    })

    df_importance_list.append(df_importance)
------------fold1-----------
Training until validation scores don't improve for 200 rounds
[100]   train's auc: 0.677174   val's auc: 0.628599
[200]   train's auc: 0.690672   val's auc: 0.63003
[300]   train's auc: 0.702212   val's auc: 0.631905
[400]   train's auc: 0.711116   val's auc: 0.632443
[500]   train's auc: 0.7182 val's auc: 0.632929
[600]   train's auc: 0.725516   val's auc: 0.632789
Early stopping, best iteration is:
[499]   train's auc: 0.718131   val's auc: 0.632934
------------fold2-----------
Training until validation scores don't improve for 200 rounds
[100]   train's auc: 0.679197   val's auc: 0.620851
[200]   train's auc: 0.693258   val's auc: 0.622213
[300]   train's auc: 0.703885   val's auc: 0.623473
[400]   train's auc: 0.714255   val's auc: 0.62404
[500]   train's auc: 0.720288   val's auc: 0.624115
[600]   train's auc: 0.726659   val's auc: 0.624428
[700]   train's auc: 0.732689   val's auc: 0.624295
[800]   train's auc: 0.73929    val's auc: 0.624619
[900]   train's auc: 0.744832   val's auc: 0.624663
[1000]  train's auc: 0.754675   val's auc: 0.62499
[1100]  train's auc: 0.759126   val's auc: 0.624757
[1200]  train's auc: 0.763615   val's auc: 0.62502
[1300]  train's auc: 0.769984   val's auc: 0.624564
Early stopping, best iteration is:
[1164]  train's auc: 0.761882   val's auc: 0.625101
------------fold3-----------
Training until validation scores don't improve for 200 rounds
[100]   train's auc: 0.677937   val's auc: 0.623831
[200]   train's auc: 0.691909   val's auc: 0.6257
[300]   train's auc: 0.702329   val's auc: 0.626483
[400]   train's auc: 0.712792   val's auc: 0.626889
[500]   train's auc: 0.719476   val's auc: 0.627282
[600]   train's auc: 0.726372   val's auc: 0.627444
[700]   train's auc: 0.732891   val's auc: 0.627488
[800]   train's auc: 0.739184   val's auc: 0.627696
[900]   train's auc: 0.747272   val's auc: 0.62776
Early stopping, best iteration is:
[756]   train's auc: 0.736675   val's auc: 0.627805
------------fold4-----------
Training until validation scores don't improve for 200 rounds
[100]   train's auc: 0.678244   val's auc: 0.632375
[200]   train's auc: 0.693808   val's auc: 0.633363
[300]   train's auc: 0.703073   val's auc: 0.633338
[400]   train's auc: 0.712667   val's auc: 0.633605
Early stopping, best iteration is:
[212]   train's auc: 0.695547   val's auc: 0.633654
------------fold5-----------
Training until validation scores don't improve for 200 rounds
[100]   train's auc: 0.677896   val's auc: 0.627158
[200]   train's auc: 0.692087   val's auc: 0.628503
[300]   train's auc: 0.701904   val's auc: 0.62927
[400]   train's auc: 0.711159   val's auc: 0.629929
[500]   train's auc: 0.718229   val's auc: 0.629463
[600]   train's auc: 0.725839   val's auc: 0.629527
Early stopping, best iteration is:
[401]   train's auc: 0.711302   val's auc: 0.630001
valid_auc_score = roc_auc_score(train[ycol],oof_lgb)
valid_auc_score
0.629614298701527

Feature Importance

df_features_importances = pd.concat(df_importance_list)
# average the split/gain importances over the five folds
df_features_importance = df_features_importances.groupby('column').mean().reset_index()
df_features_importance
column importance_split importance_gain
0 age_level 40.2 407.363945
1 campaign_id 565.6 17419.711442
2 campaign_id_count 1049.4 5384.477741
3 day_id 1194.6 9639.523684
4 gender 54.6 253.445407
5 minute_id 3935.8 22181.027330
6 minute_id_diff 3431.4 23767.593485
7 minute_id_diff_mean 3201.4 21643.519522
8 product 489.6 6321.783218
9 product_category_id 176.8 2965.427346
10 product_category_id_count 970.4 6571.430858
11 product_count 1215.4 7685.000637
12 user_depth 151.4 900.218823
13 user_group_id 395.0 3675.279637
14 user_group_id_count 610.2 3107.873730
15 user_id 3435.2 67579.315769
16 user_id_count 2032.4 31719.678195
17 var_1 248.8 1597.959461
18 webpage_id 91.6 2045.332514
19 webpage_id_count 359.8 1744.924732
df_features_importance.sort_values('importance_gain',ascending=False)
column importance_split importance_gain
15 user_id 3435.2 67579.315769
16 user_id_count 2032.4 31719.678195
6 minute_id_diff 3431.4 23767.593485
5 minute_id 3935.8 22181.027330
7 minute_id_diff_mean 3201.4 21643.519522
1 campaign_id 565.6 17419.711442
3 day_id 1194.6 9639.523684
11 product_count 1215.4 7685.000637
10 product_category_id_count 970.4 6571.430858
8 product 489.6 6321.783218
2 campaign_id_count 1049.4 5384.477741
13 user_group_id 395.0 3675.279637
14 user_group_id_count 610.2 3107.873730
9 product_category_id 176.8 2965.427346
18 webpage_id 91.6 2045.332514
19 webpage_id_count 359.8 1744.924732
17 var_1 248.8 1597.959461
12 user_depth 151.4 900.218823
0 age_level 40.2 407.363945
4 gender 54.6 253.445407
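
A horizontal bar chart makes the gain ranking easier to scan. A minimal sketch (matplotlib is assumed here; it is not imported above):

import matplotlib.pyplot as plt

ranked = df_features_importance.sort_values('importance_gain')
plt.figure(figsize=(8, 6))
plt.barh(ranked['column'], ranked['importance_gain'])  # ascending sort puts the largest gain at the top
plt.xlabel('importance_gain (mean over 5 folds)')
plt.tight_layout()
plt.show()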

Prediction

test.head()
test.columns
Index(['id', 'date', 'user_id', 'product', 'campaign_id', 'webpage_id',
       'product_category_id', 'user_group_id', 'gender', 'age_level',
       'user_depth', 'var_1', 'isClick', 'day_id', 'minute_id',
       'minute_id_diff', 'minute_id_diff_mean', 'user_id_count',
       'product_count', 'campaign_id_count', 'webpage_id_count',
       'product_category_id_count', 'user_group_id_count'],
      dtype='object')
test.loc[:, ycol] = predictions_lgb  # write the fold-averaged click probabilities into the test frame
test.loc[:,ycol]
391825    0.082708
391826    0.084275
391827    0.046645
391828    0.050657
391829    0.126505
            ...   
463286    0.079529
463287    0.041204
463288    0.046967
463289    0.048984
463290    0.078428
Name: isClick, Length: 71466, dtype: float64
test[['id',ycol]]

Saving the Predictions

test[['user_id','product','campaign_id',ycol]].to_csv('res.csv',index=False)
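
If the submission format instead expects the row id with the predicted probability, as the test[['id', ycol]] preview above suggests, the save would be the following; the expected columns and file name are assumptions, not confirmed by the source:

test[['id', ycol]].to_csv('submission.csv', index=False)  # hypothetical submission format / file name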
