Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 61a4c88

Browse files
author
shixiaowen03
committed
DSIN
1 parent 90693de commit 61a4c88

7 files changed

Lines changed: 730 additions & 129 deletions

File tree

.idea/workspace.xml

Lines changed: 184 additions & 129 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Shared constants for the data-preparation scripts.

# Fraction of user profiles kept when sub-sampling; its string form is also
# embedded in the names of the generated pickle files.
FRAC = 0.25

# NOTE(review): presumably the maximum behaviour-sequence length for a DIN
# baseline -- it is not referenced by the code visible alongside this file.
DIN_SESS_MAX_LEN = 50

# Number of sessions generated per sample for DSIN.
DSIN_SESS_COUNT = 5

# Maximum number of behaviours kept per session for DSIN.
DSIN_SESS_MAX_LEN = 10
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
from collections import OrderedDict
2+
3+
from deepctr.input_embedding import get_inputs_list, create_singlefeat_inputdict, get_embedding_vec_list
4+
from deepctr.layers.core import DNN, PredictionLayer
5+
from deepctr.layers.sequence import AttentionSequencePoolingLayer, BiLSTM, Transformer, BiasEncoding
6+
from deepctr.layers.utils import concat_fun, NoMask
7+
from deepctr.utils import check_feature_config_dict
8+
from tensorflow.python.keras.initializers import RandomNormal
9+
from tensorflow.python.keras.layers import Input, Dense, Embedding, Concatenate, Flatten
10+
from tensorflow.python.keras.models import Model
11+
from tensorflow.python.keras.regularizers import l2
12+
13+
14+
def DSIN(feature_dim_dict, sess_feature_list, embedding_size=8, sess_max_count=5, sess_len_max=10,
         att_embedding_size=1, att_head_num=8, dnn_hidden_units=(200, 80), dnn_activation='sigmoid',
         l2_reg_dnn=0, l2_reg_embedding=1e-6, task='binary', dnn_dropout=0, init_std=0.0001, seed=1024,
         bias_encoding=False):
    """Assemble the DSIN Keras model.

    The graph is built in these steps:

    1. create the single-feature inputs, the per-session behaviour inputs and
       the ``sess_length`` input via ``get_input``;
    2. create one ``Embedding`` per sparse feature (``mask_zero`` is enabled
       for features listed in ``sess_feature_list``);
    3. embed the session features of the candidate as the attention query and
       the remaining sparse features as the flat "deep" input;
    4. embed every session, optionally apply ``BiasEncoding``, and run a
       shared ``Transformer`` over each session;
    5. pool the session representations with attention, both directly and
       after a ``BiLSTM``;
    6. concatenate everything (plus dense inputs, if any) and feed it through
       ``DNN`` -> ``Dense(1)`` -> ``PredictionLayer``.

    :param feature_dim_dict: dict with ``"sparse"`` (and, per
        ``create_singlefeat_inputdict``, dense) feature descriptions.
    :param sess_feature_list: names of the sparse features that also appear
        inside the behaviour sessions.
    :param embedding_size: output size of every sparse embedding.
    :param sess_max_count: number of sessions per sample.
    :param sess_len_max: length of each session input.
    :param att_embedding_size: forwarded to ``Transformer``.
    :param att_head_num: forwarded to ``Transformer``.
    :param dnn_hidden_units: forwarded to ``DNN``.
    :param dnn_activation: forwarded to ``DNN``.
    :param l2_reg_dnn: forwarded to ``DNN``.
    :param l2_reg_embedding: L2 factor of the embedding regularizer.
    :param task: forwarded to ``PredictionLayer``.
    :param dnn_dropout: forwarded to ``DNN``.
    :param init_std: stddev of the ``RandomNormal`` embedding initializer.
    :param seed: seed for the initializer, the ``Transformer`` and the ``DNN``.
    :param bias_encoding: when true, ``BiasEncoding`` is applied to the session
        embeddings and the Transformer's positional encoding is disabled.
    :return: a ``Model`` whose inputs are the single-feature inputs, then the
        session inputs (session by session), then the ``sess_length`` input.
    """
    # Check that the feature dictionary is well-formed.
    check_feature_config_dict(feature_dim_dict)

    sparse_input, dense_input, user_behavior_input_dict, _, user_sess_length = get_input(
        feature_dim_dict, sess_feature_list, sess_max_count, sess_len_max)

    sparse_feats = feature_dim_dict["sparse"]

    # One embedding table per sparse feature, shared between the candidate
    # item and the behaviour sessions.
    sparse_embedding_dict = {}
    for pos, feat in enumerate(sparse_feats):
        sparse_embedding_dict[feat.name] = Embedding(
            feat.dimension, embedding_size,
            embeddings_initializer=RandomNormal(mean=0.0, stddev=init_std, seed=seed),
            embeddings_regularizer=l2(l2_reg_embedding),
            name='sparse_emb_' + str(pos) + '-' + feat.name,
            mask_zero=(feat.name in sess_feature_list))

    # Attention query: embeddings of the session features only.
    query_emb = concat_fun(
        get_embedding_vec_list(sparse_embedding_dict, sparse_input, sparse_feats,
                               sess_feature_list, sess_feature_list))

    # Flat DNN input: embeddings of all sparse features, mask removed.
    deep_input_emb = concat_fun(
        get_embedding_vec_list(sparse_embedding_dict, sparse_input, sparse_feats,
                               mask_feat_list=sess_feature_list))
    deep_input_emb = Flatten()(NoMask()(deep_input_emb))

    tr_input = sess_interest_division(sparse_embedding_dict, user_behavior_input_dict, sparse_feats,
                                      sess_feature_list, sess_max_count, bias_encoding=bias_encoding)

    # A single Transformer instance is reused for every session.
    self_attention = Transformer(att_embedding_size, att_head_num, dropout_rate=0, use_layer_norm=False,
                                 use_positional_encoding=(not bias_encoding), seed=seed,
                                 supports_masking=True, blinding=True)
    sess_fea = sess_interest_extractor(tr_input, sess_max_count, self_attention)

    interest_pooled = AttentionSequencePoolingLayer(
        att_hidden_units=(64, 16), weight_normalization=True, supports_masking=False)(
        [query_emb, sess_fea, user_sess_length])

    lstm_outputs = BiLSTM(len(sess_feature_list) * embedding_size,
                          layers=2, res_layers=0, dropout_rate=0.2, )(sess_fea)
    lstm_pooled = AttentionSequencePoolingLayer(
        att_hidden_units=(64, 16), weight_normalization=True)(
        [query_emb, lstm_outputs, user_sess_length])

    deep_input_emb = Concatenate()(
        [deep_input_emb, Flatten()(interest_pooled), Flatten()(lstm_pooled)])
    if len(dense_input) > 0:
        deep_input_emb = Concatenate()([deep_input_emb] + list(dense_input.values()))

    hidden = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, False, seed)(deep_input_emb)
    logit = Dense(1, use_bias=False, activation=None)(hidden)
    output = PredictionLayer(task)(logit)

    # Session inputs are appended session by session, in index order.
    sess_input_list = []
    for sess_idx in range(sess_max_count):
        sess_input_list.extend(
            get_inputs_list([user_behavior_input_dict["sess_" + str(sess_idx)]]))

    model_input_list = (get_inputs_list([sparse_input, dense_input])
                        + sess_input_list
                        + [user_sess_length])

    return Model(inputs=model_input_list, outputs=output)
82+
83+
84+
def get_input(feature_dim_dict, seq_feature_list, sess_max_count, seq_max_len):
    """Create every Keras ``Input`` placeholder needed by the model.

    :param feature_dim_dict: passed to ``create_singlefeat_inputdict``.
    :param seq_feature_list: names of the features present in each session.
    :param sess_max_count: number of sessions per sample.
    :param seq_max_len: length of each session input.
    :return: tuple ``(sparse_input, dense_input, user_behavior_input,
        user_behavior_length, user_sess_length)`` where
        ``user_behavior_input`` maps ``"sess_<k>"`` to an ``OrderedDict`` of
        ``feature name -> Input(shape=(seq_max_len,))``,
        ``user_behavior_length`` maps ``"sess_<k>"`` to an
        ``Input(shape=(1,))`` and ``user_sess_length`` is a single
        ``Input(shape=(1,))`` named ``sess_length``.
    """
    sparse_input, dense_input = create_singlefeat_inputdict(feature_dim_dict)

    user_behavior_input = {}
    for sess_idx in range(sess_max_count):
        per_sess_inputs = OrderedDict()
        for feat_idx, feat_name in enumerate(seq_feature_list):
            input_name = 'seq_' + str(sess_idx) + str(feat_idx) + '-' + feat_name
            per_sess_inputs[feat_name] = Input(shape=(seq_max_len,), name=input_name)
        user_behavior_input["sess_" + str(sess_idx)] = per_sess_inputs

    user_behavior_length = {}
    for sess_idx in range(sess_max_count):
        user_behavior_length["sess_" + str(sess_idx)] = Input(
            shape=(1,), name='seq_length' + str(sess_idx))

    user_sess_length = Input(shape=(1,), name='sess_length')

    return sparse_input, dense_input, user_behavior_input, user_behavior_length, user_sess_length
99+
100+
101+
def sess_interest_division(sparse_embedding_dict, user_behavior_input_dict, sparse_fg_list, sess_feture_list,
                           sess_max_count,
                           bias_encoding=True):
    """Embed each behaviour session and optionally apply bias encoding.

    For every session ``"sess_<k>"`` (``k`` in ``range(sess_max_count)``) the
    session features are looked up in ``sparse_embedding_dict`` and
    concatenated with ``concat_fun``.

    :param sparse_embedding_dict: feature name -> embedding layer.
    :param user_behavior_input_dict: ``"sess_<k>"`` -> inputs of that session.
    :param sparse_fg_list: sparse feature descriptions, forwarded to
        ``get_embedding_vec_list``.
    :param sess_feture_list: names of the session features.
    :param sess_max_count: number of sessions to process.
    :param bias_encoding: when true, the list of session embeddings is passed
        through ``BiasEncoding(sess_max_count)`` before being returned.
    :return: the list of per-session embeddings, or the output of
        ``BiasEncoding`` applied to that list.
    """
    session_embs = [
        concat_fun(
            get_embedding_vec_list(sparse_embedding_dict,
                                   user_behavior_input_dict["sess_" + str(sess_idx)],
                                   sparse_fg_list, sess_feture_list, sess_feture_list))
        for sess_idx in range(sess_max_count)
    ]

    if not bias_encoding:
        return session_embs
    return BiasEncoding(sess_max_count)(session_embs)
116+
117+
118+
def sess_interest_extractor(tr_input, sess_max_count, TR):
    """Apply ``TR`` to every session and join the results along axis 1.

    :param tr_input: indexable collection of per-session tensors.
    :param sess_max_count: number of sessions to process.
    :param TR: callable layer; called as ``TR([x, x])`` with the same session
        tensor passed twice.
    :return: ``concat_fun`` of the per-session outputs with ``axis=1``.
    """
    per_session_out = [TR([tr_input[k], tr_input[k]]) for k in range(sess_max_count)]
    return concat_fun(per_session_out, axis=1)
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
2+
import os
3+
4+
import numpy as np
5+
import pandas as pd
6+
from deepctr.utils import SingleFeat
7+
from sklearn.preprocessing import LabelEncoder, StandardScaler
8+
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
9+
from tqdm import tqdm
10+
11+
from config import DSIN_SESS_COUNT, DSIN_SESS_MAX_LEN, FRAC
12+
13+
# FRAC is used directly as imported from config; the former `FRAC = FRAC`
# self-assignment was a no-op and has been dropped.

# Short alias for the number of sessions generated per sample.
SESS_COUNT = DSIN_SESS_COUNT
15+
16+
17+
def gen_sess_feature_dsin(row, hist_sessions=None, sess_count=None, sess_max_len=None):
    """Build the per-session behaviour features for one sample row.

    Each session in the history is a sequence of events, where an event is
    indexed as ``e[0]`` (written to ``'cate_id'``), ``e[1]`` (written to
    ``'brand'``) and ``e[2]`` (compared against the sample's ``time_stamp``).

    Selection logic:

    * If the user has no history, every session is filled with ``[0]`` and
      the session count is 0.
    * Otherwise the user's sessions are scanned from the last index to the
      first.  The first session whose first event is earlier than the sample
      time and which has more than two events earlier than the sample time
      becomes ``sess_0`` (truncated to the last ``sess_max_len`` of those
      events).  If no session qualifies, ``sess_0`` stays empty and the
      anchor index stays at the user's last session.
    * ``sess_1 .. sess_{n-1}`` are taken from the sessions immediately before
      the anchor index (last ``sess_max_len`` events each); missing ones are
      filled with ``[0]``.

    NOTE(review): the nesting of the original ``break`` was reconstructed
    from a de-indented listing; it is placed inside the ``> 2`` check so the
    scan continues past sessions with too few qualifying events -- confirm
    against the upstream file.

    :param row: an ``(index, record)`` pair as yielded by
        ``DataFrame.iterrows()``; ``record`` must provide ``'user'`` and
        ``'time_stamp'``.
    :param hist_sessions: mapping ``user -> list of sessions``.  Defaults to
        the module-level ``user_hist_session`` that the script populates.
    :param sess_count: number of sessions to emit; defaults to
        ``DSIN_SESS_COUNT``.
    :param sess_max_len: maximum events kept per session; defaults to
        ``DSIN_SESS_MAX_LEN``.
    :return: ``(sess_input_dict, sess_input_length_dict, sess_length)`` where
        ``sess_input_dict['sess_<k>']`` is ``{'cate_id': [...], 'brand':
        [...]}``, ``sess_input_length_dict['sess_<k>']`` is the number of
        events kept and ``sess_length`` is the number of sessions filled.
    """
    # Resolve defaults at call time so existing callers (which pass only
    # ``row``) keep their behaviour.
    if hist_sessions is None:
        hist_sessions = user_hist_session
    if sess_count is None:
        sess_count = DSIN_SESS_COUNT
    if sess_max_len is None:
        sess_max_len = DSIN_SESS_MAX_LEN

    sess_names = ['sess_' + str(i) for i in range(sess_count)]
    sess_input_dict = {name: {'cate_id': [], 'brand': []} for name in sess_names}
    sess_input_length_dict = {name: 0 for name in sess_names}

    user, time_stamp = row[1]['user'], row[1]['time_stamp']

    if user not in hist_sessions:
        # No history at all: every session is a single padding id.
        for name in sess_names:
            sess_input_dict[name]['cate_id'] = [0]
            sess_input_dict[name]['brand'] = [0]
        return sess_input_dict, sess_input_length_dict, 0

    sessions = hist_sessions[user]
    valid_sess_count = 0
    last_sess_idx = len(sessions) - 1

    # Scan from the last session backwards for the anchor session (sess_0).
    for idx in reversed(range(len(sessions))):
        cur_sess = sessions[idx]
        if cur_sess[0][2] < time_stamp:
            # The first event already qualifies; count the remaining ones.
            in_sess_count = 1 + sum(1 for event in cur_sess[1:] if event[2] < time_stamp)
            if in_sess_count > 2:
                window = cur_sess[max(0, in_sess_count - sess_max_len):in_sess_count]
                sess_input_dict['sess_0']['cate_id'] = [e[0] for e in window]
                sess_input_dict['sess_0']['brand'] = [e[1] for e in window]
                sess_input_length_dict['sess_0'] = min(sess_max_len, in_sess_count)
                last_sess_idx = idx
                valid_sess_count += 1
                break

    # Remaining slots come from the sessions just before the anchor index.
    for i in range(1, sess_count):
        name = 'sess_' + str(i)
        if last_sess_idx - i >= 0:
            tail = sessions[last_sess_idx - i]
            kept = tail[-sess_max_len:]
            sess_input_dict[name]['cate_id'] = [e[0] for e in kept]
            sess_input_dict[name]['brand'] = [e[1] for e in kept]
            sess_input_length_dict[name] = min(sess_max_len, len(tail))
            valid_sess_count += 1
        else:
            sess_input_dict[name]['cate_id'] = [0]
            sess_input_dict[name]['brand'] = [0]
            sess_input_length_dict[name] = 0

    return sess_input_dict, sess_input_length_dict, valid_sess_count
71+
72+
73+
if __name__ == "__main__":
    # Script entry point: turn the sampled data and the per-user session
    # history into the pickled input list, label array and feature
    # description consumed by the DSIN model.

    # gen_sess_feature_dsin() looks this dict up as a module-level global.
    user_hist_session = {}
    hist_prefix = 'user_hist_session_' + str(FRAC) + '_dsin_'
    FILE_NUM = len(
        list(filter(lambda x: x.startswith(hist_prefix),
                    os.listdir('../sampled_data/'))))

    print('total', FILE_NUM, 'files')

    # The session history is sharded over FILE_NUM pickles; merge them.
    for i in range(FILE_NUM):
        user_hist_session_ = pd.read_pickle(
            '../sampled_data/' + hist_prefix + str(i) + '.pkl')
        user_hist_session.update(user_hist_session_)
        del user_hist_session_

    sample_sub = pd.read_pickle(
        '../sampled_data/raw_sample_' + str(FRAC) + '.pkl')

    # Column-wise accumulators: one list per session slot and feature.
    sess_input_dict = {}
    sess_input_length_dict = {}
    for i in range(SESS_COUNT):
        sess_input_dict['sess_' + str(i)] = {'cate_id': [], 'brand': []}
        sess_input_length_dict['sess_' + str(i)] = []

    sess_length_list = []
    for row in tqdm(sample_sub[['user', 'time_stamp']].iterrows()):
        sess_input_dict_, sess_input_length_dict_, sess_length = gen_sess_feature_dsin(
            row)
        for i in range(SESS_COUNT):
            sess_name = 'sess_' + str(i)
            sess_input_dict[sess_name]['cate_id'].append(
                sess_input_dict_[sess_name]['cate_id'])
            sess_input_dict[sess_name]['brand'].append(
                sess_input_dict_[sess_name]['brand'])
            sess_input_length_dict[sess_name].append(
                sess_input_length_dict_[sess_name])
        sess_length_list.append(sess_length)

    print('done')

    user = pd.read_pickle('../sampled_data/user_profile_' + str(FRAC) + '.pkl')
    ad = pd.read_pickle('../sampled_data/ad_feature_enc_' + str(FRAC) + '.pkl')
    user = user.fillna(-1)
    # The raw column name carries a trailing space; normalise it.
    user.rename(
        columns={'new_user_class_level ': 'new_user_class_level'}, inplace=True)

    # Reuse the frame loaded above (it has not been modified) instead of
    # reading the same pickle a second time.
    sample_sub = sample_sub.rename(columns={'user': 'userid'})

    data = pd.merge(sample_sub, user, how='left', on='userid', )
    data = pd.merge(data, ad, how='left', on='adgroup_id')

    sparse_features = ['userid', 'adgroup_id', 'pid', 'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
                       'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level', 'campaign_id',
                       'customer']  # sparse features for user and ads

    dense_features = ['price']  # dense features for user and ads

    for feat in tqdm(sparse_features):
        lbe = LabelEncoder()  # or Hash
        # Map every distinct value of the column to an integer id.
        data[feat] = lbe.fit_transform(data[feat])
    mms = StandardScaler()
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 'cate_id' and 'brand' are not re-encoded here; they are read from the
    # merged frame as-is and only their cardinality (+1) is recorded.
    sparse_feature_list = [SingleFeat(feat, data[feat].nunique(
    ) + 1) for feat in sparse_features + ['cate_id', 'brand']]
    dense_feature_list = [SingleFeat(feat, 1) for feat in dense_features]
    sess_feature = ['cate_id', 'brand']  # session features for ads

    sess_input = []
    # Collected for parity with the original script; not written to any
    # output file below.
    sess_input_length = []
    for i in tqdm(range(SESS_COUNT)):
        sess_name = 'sess_' + str(i)
        for feat in sess_feature:
            # Fix: pad each session to the per-session length
            # (DSIN_SESS_MAX_LEN), not to the number of sessions.  Using
            # SESS_COUNT here truncated every session to 5 events, while the
            # model's session inputs default to length 10.
            sess_input.append(pad_sequences(
                sess_input_dict[sess_name][feat], maxlen=DSIN_SESS_MAX_LEN, padding='post'))
            sess_input_length.append(sess_input_length_dict[sess_name])

    # Input order: sparse features, dense features, then the session arrays
    # (session by session, feature by feature) and finally the session count.
    model_input = [data[feat.name].values for feat in sparse_feature_list] + \
                  [data[feat.name].values for feat in dense_feature_list]
    sess_lists = sess_input + [np.array(sess_length_list)]
    model_input += sess_lists

    if not os.path.exists('../model_input/'):
        os.mkdir('../model_input/')

    pd.to_pickle(model_input, '../model_input/dsin_input_' +
                 str(FRAC) + '_' + str(SESS_COUNT) + '.pkl')
    pd.to_pickle(data['clk'].values, '../model_input/dsin_label_' +
                 str(FRAC) + '_' + str(SESS_COUNT) + '.pkl')
    pd.to_pickle({'sparse': sparse_feature_list, 'dense': dense_feature_list},
                 '../model_input/dsin_fd_' + str(FRAC) + '_' + str(SESS_COUNT) + '.pkl')
    print("gen dsin input done")
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# coding: utf-8
2+
import os
3+
4+
import numpy as np
5+
import pandas as pd
6+
from sklearn.preprocessing import LabelEncoder
7+
8+
from config import FRAC
9+
10+
if __name__ == "__main__":
    # Script entry point: sub-sample users, filter the click samples and the
    # page-view behaviour log to those users, label-encode category/brand ids
    # consistently across the ad table and the log, and pickle the results.

    if not os.path.exists('../sampled_data/'):
        os.mkdir('../sampled_data/')

    # Fix: the cache check used to look for '..._<FRAC>_.pkl' while the files
    # are written (and read downstream) as '..._<FRAC>.pkl', so the cache
    # could never hit.  Both sides now share one path.
    user_sub_path = '../sampled_data/user_profile_' + str(FRAC) + '.pkl'
    sample_sub_path = '../sampled_data/raw_sample_' + str(FRAC) + '.pkl'

    if os.path.exists(user_sub_path) and os.path.exists(sample_sub_path):
        user_sub = pd.read_pickle(user_sub_path)
        sample_sub = pd.read_pickle(sample_sub_path)
    else:
        # The raw CSVs are only needed when the sampled pickles are missing.
        user = pd.read_csv('../data/user_profile.csv')
        sample = pd.read_csv('../data/raw_sample.csv')

        if FRAC < 1.0:
            # Fixed random_state keeps the sample reproducible across runs.
            user_sub = user.sample(frac=FRAC, random_state=1024)
        else:
            user_sub = user
        sample_sub = sample.loc[sample.user.isin(user_sub.userid.unique())]
        pd.to_pickle(user_sub, user_sub_path)
        pd.to_pickle(sample_sub, sample_sub_path)

    # Cache the page-view-only slice of the behaviour log.
    if os.path.exists('../data/behavior_log_pv.pkl'):
        log = pd.read_pickle('../data/behavior_log_pv.pkl')
    else:
        log = pd.read_csv('../data/behavior_log.csv')
        log = log.loc[log['btag'] == 'pv']
        pd.to_pickle(log, '../data/behavior_log_pv.pkl')

    userset = user_sub.userid.unique()
    log = log.loc[log.user.isin(userset)]

    ad = pd.read_csv('../data/ad_feature.csv')
    ad['brand'] = ad['brand'].fillna(-1)

    # Fit one encoder on the union of ids from both tables so the ad table
    # and the log share the same id space; +1 leaves 0 unused by real ids.
    lbe = LabelEncoder()
    unique_cate_id = np.concatenate(
        (ad['cate_id'].unique(), log['cate'].unique()))

    lbe.fit(unique_cate_id)
    ad['cate_id'] = lbe.transform(ad['cate_id']) + 1
    log['cate'] = lbe.transform(log['cate']) + 1

    lbe = LabelEncoder()
    unique_brand = np.concatenate(
        (ad['brand'].unique(), log['brand'].unique()))

    lbe.fit(unique_brand)
    ad['brand'] = lbe.transform(ad['brand']) + 1
    log['brand'] = lbe.transform(log['brand']) + 1

    # Keep only log rows of users that actually appear in the samples.
    log = log.loc[log.user.isin(sample_sub.user.unique())]
    log.drop(columns=['btag'], inplace=True)
    log = log.loc[log['time_stamp'] > 0]

    pd.to_pickle(ad, '../sampled_data/ad_feature_enc_' + str(FRAC) + '.pkl')
    pd.to_pickle(
        log, '../sampled_data/behavior_log_pv_user_filter_enc_' + str(FRAC) + '.pkl')

    print("0_gen_sampled_data done")

0 commit comments

Comments
 (0)