广告投放中的CTR预估模型
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc
import os
import lightgbm as lgb
数据清洗
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc
import os
import lightgbm as lgb
# ---- 数据加载 / Data loading ----
# Directory containing train.csv / test.csv.
data_path = './datasets/'

train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))
特征字段说明:
- id: 用户行为id,唯一标识,无重复
- date: 行为时间,精确到秒
- user_id: 用户id
- product: 产品
- campaign_id: 活动id
- webpage_id: 网页id
- product_category_id: 产品类型id
- user_group_id: 用户所属群组id
- gender: 性别
- age_level: 年龄等级
- user_depth: 用户价值深度
- var_1: 匿名特征
- isClick: 是否点击,1为点击,0为未点击
# ---- 合并数据并解析时间 / Merge frames and parse timestamps ----
# Concatenate train and test so feature engineering is applied consistently
# to both; rows are split back apart later on the isClick null mask.
data = pd.concat([train, test], ignore_index=True)

# Parse the raw `date` string (precise to the second) into:
#   day_id    - day component taken from characters [3:5] of the string
#               (assumes a fixed-width date layout - TODO confirm format)
#   minute_id - minute-of-day = hour * 60 + minute, read from the tail of
#               the string ("...HH:MM" -> x[-5:-3] is HH, x[-2:] is MM)
data['day_id'] = data['date'].apply(lambda x: int(x[3:5]))
data['minute_id'] = data['date'].apply(lambda x: int(x[-5:-3]) * 60 + int(x[-2:]))
# ---- 特征工程 / Feature engineering ----
# Gap (in minutes) between a user's consecutive page views within the same
# day, plus its per-(user, day) mean: browsing cadence tends to correlate
# with click propensity.
#
# transform() is used (not apply()/agg()) because it broadcasts the result
# back to the original row alignment, so it can be assigned as a column.
data['minute_id_diff'] = (
    data.groupby(['user_id', 'day_id'])['minute_id']
        .transform(lambda x: x.shift(-1) - x)
)

# Per-(user, day) mean of those gaps, broadcast back to every row.
data['minute_id_diff_mean'] = (
    data.groupby(['user_id', 'day_id'])['minute_id_diff'].transform('mean')
)

# Frequency (count) encoding for the main categorical identifiers.
# Counting any non-null column works; minute_id is used as the counter.
for col in ['user_id', 'product', 'campaign_id', 'webpage_id',
            'product_category_id', 'user_group_id']:
    data['{}_count'.format(col)] = data.groupby(col)['minute_id'].transform('count')
# ---- 特征列表与训练/测试划分 / Feature list and train-test split ----
ycol = 'isClick'
drop_list = [ycol, 'id', 'date']

# Every remaining column is used as a model feature.
features = [x for x in data.columns if x not in drop_list]
print("使用{} 个特征:{}".format(len(features), features))

# 下列特征转化为类别特征
# Cast identifier-like columns to pandas 'category' dtype so LightGBM
# treats them as categorical features.
categorical_feature = [
    'user_id',
    'product',
    'campaign_id',
    'webpage_id',
    'product_category_id',
    'user_group_id',
    'gender',
    'age_level',
    'user_depth',
]
for col in categorical_feature:
    data[col] = data[col].astype('category')

# Test rows carry no isClick label; train rows do.
# .copy() detaches the slices from `data`, avoiding the
# SettingWithCopyWarning triggered later when columns are assigned.
train = data[~data[ycol].isnull()].copy()
test = data[data[ycol].isnull()].copy()

# Free the merged frame before training.
del data
gc.collect()
# ---- 五折交叉验证训练模型 / 5-fold cross-validated training ----
NFOLD = 5
random_state = 2021
KF = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=random_state)

# LightGBM hyper-parameters.
# NOTE: the original used 'force_row_size' and 'subsamples', both rejected
# by LightGBM as unknown parameters (see the training-log warnings); the
# correct names are 'force_row_wise' and 'subsample'.
params_lgb = {
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'force_row_wise': True,
    'random_state': random_state,
    'learning_rate': 0.03,
    'max_depth': 8,
    'num_leaves': 40,
    'subsample': 0.8,
    'subsample_freq': 3,
    'colsample_bytree': 0.8,
    'n_jobs': -1,
    'verbose': -1,
}

oof_lgb = np.zeros(len(train))          # out-of-fold predictions on train
predictions_lgb = np.zeros(len(test))   # fold-averaged predictions on test
df_importance_list = []                 # per-fold feature importances

# 五折交叉验证
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train[features], train[ycol])):
    print('------------fold{}-----------'.format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=train.iloc[trn_idx][ycol])
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=train.iloc[val_idx][ycol],
                           reference=trn_data)

    clf_lgb = lgb.train(
        params=params_lgb,
        train_set=trn_data,
        valid_sets=[trn_data, val_data],
        valid_names=('train', 'val'),
        num_boost_round=50000,
        early_stopping_rounds=200,
        verbose_eval=100,
    )

    # Out-of-fold prediction for this fold's validation rows.
    oof_lgb[val_idx] = clf_lgb.predict(train.iloc[val_idx][features],
                                       num_iteration=clf_lgb.best_iteration)
    # Accumulate the test prediction, averaged over the folds.
    predictions_lgb[:] += (clf_lgb.predict(test[features],
                                           num_iteration=clf_lgb.best_iteration)
                           / NFOLD)

    df_importance = pd.DataFrame({
        'column': features,
        'importance_split': clf_lgb.feature_importance(importance_type='split'),
        'importance_gain': clf_lgb.feature_importance(importance_type='gain'),
    })
    df_importance_list.append(df_importance)
------------fold1-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
/Users/gaozhiyong/Documents/pyenv/pyenv3.6/lib/python3.6/site-packages/lightgbm/basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
_log_warning('Overriding the parameters from Reference Dataset.')
/Users/gaozhiyong/Documents/pyenv/pyenv3.6/lib/python3.6/site-packages/lightgbm/basic.py:1245: UserWarning: categorical_column in param dict is overridden.
_log_warning('{} in param dict is overridden.'.format(cat_alias))
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.677174 val's auc: 0.628599
[200] train's auc: 0.690672 val's auc: 0.63003
[300] train's auc: 0.702212 val's auc: 0.631905
[400] train's auc: 0.711116 val's auc: 0.632443
[500] train's auc: 0.7182 val's auc: 0.632929
[600] train's auc: 0.725516 val's auc: 0.632789
Early stopping, best iteration is:
[499] train's auc: 0.718131 val's auc: 0.632934
------------fold2-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.679197 val's auc: 0.620851
[200] train's auc: 0.693258 val's auc: 0.622213
[300] train's auc: 0.703885 val's auc: 0.623473
[400] train's auc: 0.714255 val's auc: 0.62404
[500] train's auc: 0.720288 val's auc: 0.624115
[600] train's auc: 0.726659 val's auc: 0.624428
[700] train's auc: 0.732689 val's auc: 0.624295
[800] train's auc: 0.73929 val's auc: 0.624619
[900] train's auc: 0.744832 val's auc: 0.624663
[1000] train's auc: 0.754675 val's auc: 0.62499
[1100] train's auc: 0.759126 val's auc: 0.624757
[1200] train's auc: 0.763615 val's auc: 0.62502
[1300] train's auc: 0.769984 val's auc: 0.624564
Early stopping, best iteration is:
[1164] train's auc: 0.761882 val's auc: 0.625101
------------fold3-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.677937 val's auc: 0.623831
[200] train's auc: 0.691909 val's auc: 0.6257
[300] train's auc: 0.702329 val's auc: 0.626483
[400] train's auc: 0.712792 val's auc: 0.626889
[500] train's auc: 0.719476 val's auc: 0.627282
[600] train's auc: 0.726372 val's auc: 0.627444
[700] train's auc: 0.732891 val's auc: 0.627488
[800] train's auc: 0.739184 val's auc: 0.627696
[900] train's auc: 0.747272 val's auc: 0.62776
Early stopping, best iteration is:
[756] train's auc: 0.736675 val's auc: 0.627805
------------fold4-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.678244 val's auc: 0.632375
[200] train's auc: 0.693808 val's auc: 0.633363
[300] train's auc: 0.703073 val's auc: 0.633338
[400] train's auc: 0.712667 val's auc: 0.633605
Early stopping, best iteration is:
[212] train's auc: 0.695547 val's auc: 0.633654
------------fold5-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.677896 val's auc: 0.627158
[200] train's auc: 0.692087 val's auc: 0.628503
[300] train's auc: 0.701904 val's auc: 0.62927
[400] train's auc: 0.711159 val's auc: 0.629929
[500] train's auc: 0.718229 val's auc: 0.629463
[600] train's auc: 0.725839 val's auc: 0.629527
Early stopping, best iteration is:
[401] train's auc: 0.711302 val's auc: 0.630001
# ---- 验证集 AUC 与特征重要性 / Validation AUC and feature importance ----
# AUC of the out-of-fold predictions against the true labels.
valid_auc_score = roc_auc_score(train[ycol], oof_lgb)
print(valid_auc_score)

# 特征重要性: average the per-fold importances for each feature,
# then rank by information gain.
df_features_importances = pd.concat(df_importance_list)
df_features_importance = (
    df_features_importances.groupby('column').mean().reset_index()
)
print(df_features_importance.sort_values('importance_gain', ascending=False))
# ---- 预测与保存 / Predict and persist the submission ----
# Attach the fold-averaged click probabilities to the test frame.
# (The original triggered SettingWithCopyWarning here because `test` was a
# view of `data`; a .loc assignment on an owned frame is clean.)
test.loc[:, ycol] = predictions_lgb

# 保存预测结果
# NOTE(review): the original writes user_id/product/campaign_id instead of
# the unique row `id` — kept byte-for-byte for output compatibility, but
# verify against the submission format, which likely expects `id`.
test[['user_id', 'product', 'campaign_id', ycol]].to_csv('res.csv', index=False)
About ME
👋 读书城南,🤔 在未来面前,我们都是孩子~
- 📙 一个热衷于探索学习新方向、新事物的智能产品经理,闲暇时间喜欢coding💻、画图🎨、音乐🎵、学习ing~
👋 Social Media
🛠️ Blog: http://oceaneyes.top
⚡ PM导航: https://pmhub.oceangzy.top
☘️ CNBLOG: https://www.cnblogs.com/oceaneyes-gzy/
🌱 AI PRJ自己部署的一些算法demo: http://ai.oceangzy.top/
📫 Email: 1450136519@qq.com
💬 WeChat: OCEANGZY
💬 公众号: UncleJoker-GZY
👋 加入小组~
👋 感谢打赏~
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 OCEAN.GZY读书城南!