diff --git a/.gitignore b/.gitignore
index c1168d2..0e7ad98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,9 +129,6 @@ dmypy.json
 .pyre/
 
 .idea
-<<<<<<< HEAD
 *.csv
 .DS_Store
-=======
-*.csv
->>>>>>> 734986b93a9246f05fb1b15f98977242f436de04
+
diff --git a/algorithm/DIN/activations.py b/algorithm/DIN/activations.py
index 29b0dea..6443c7f 100644
--- a/algorithm/DIN/activations.py
+++ b/algorithm/DIN/activations.py
@@ -1,42 +1,3 @@
-<<<<<<< HEAD
-import tensorflow as tf
-
-
-def prelu(x, name=""):
-    """
-    Args:
-        x (tf.Tensor): input tensor
-        name (str): variable-name suffix for alpha
-    Returns:
-        tensor after the prelu activation
-    """
-
-    alpha = tf.get_variable(name=f"prelu_alpha_{name}",
-                            shape=x.shape[-1],
-                            initializer=tf.constant_initializer(1.0),
-                            dtype=x.dtype)
-    return tf.maximum(0.0, x) + alpha * tf.minimum(0.0, x)
-
-
-def dice(x, name=""):
-    """
-    Args:
-        x (tf.Tensor): input tensor
-        name (str): variable-name suffix for alpha and beta
-    Returns:
-        tensor after the dice activation
-    """
-
-    alpha = tf.get_variable(name=f"dice_alpha_{name}",
-                            shape=x.shape[-1],
-                            initializer=tf.constant_initializer(1.0),
-                            dtype=x.dtype)
-    # Reuse the batch_normalization API; no trainable beta and gamma are needed
-    x_norm = tf.layers.batch_normalization(x, center=False, scale=False, name=f"dice_bn_{name}")
-    px = tf.sigmoid(x_norm)
-
-    return x * px + alpha * x * (1 - px)
-=======
 import tensorflow as tf
 
 
@@ -74,4 +35,3 @@ def dice(x, name=""):
     px = tf.sigmoid(x_norm)
 
     return x * px + alpha * x * (1 - px)
->>>>>>> 734986b93a9246f05fb1b15f98977242f436de04
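
The `dice` implementation kept above gates between the identity and a PReLU-like branch: f(x) = p * x + alpha * (1 - p) * x with p = sigmoid(BN(x)), where the batch norm carries no learnable shift or scale. A minimal NumPy sketch of that forward pass, assuming fixed batch statistics and a scalar alpha in place of the per-channel TF variables:

```python
import numpy as np

def dice_forward(x, alpha=0.25, mean=0.0, var=1.0, eps=1e-9):
    # Mirrors batch_normalization(center=False, scale=False):
    # normalize with batch statistics only, no beta/gamma.
    x_norm = (x - mean) / np.sqrt(var + eps)
    px = 1.0 / (1.0 + np.exp(-x_norm))  # p(x) = sigmoid(BN(x))
    # px -> 1 recovers the identity; px -> 0 recovers alpha * x.
    return x * px + alpha * x * (1.0 - px)

print(dice_forward(np.array([-2.0, -0.5, 0.5, 2.0])))
```
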
-""" - -import os -from typing import List, Tuple, Any -import pandas as pd -import tensorflow as tf -from tensorflow import feature_column as fc -from din_attention import din_attention -from activations import prelu, dice - -# 定义输入参数 -flags = tf.app.flags - -# 训练参数 -flags.DEFINE_string("model_dir", "./model_dir", "Directory where model parameters, graph, etc are saved") -flags.DEFINE_string("output_dir", "./output_dir", "Directory where pb file are saved") - -# flags.DEFINE_string("output_model", "./model_output", "Path to the training data.") -flags.DEFINE_string("train_data", "../../dataset/wechat_algo_data1/tfrecord/train.tfrecord", "Path to the train data") -flags.DEFINE_string("eval_data", "../../dataset/wechat_algo_data1/tfrecord/test.tfrecord", - "Path to the evaluation data") -flags.DEFINE_string("vocabulary_dir", "../../dataset/wechat_algo_data1/vocabulary/", - "Folder where the vocabulary file is stored") -flags.DEFINE_integer("num_epochs", 1, "Epoch of training phase") -flags.DEFINE_integer("train_steps", 10000, "Number of (global) training steps to perform") -flags.DEFINE_integer("shuffle_buffer_size", 10000, "Dataset shuffle buffer size") -flags.DEFINE_integer("num_parallel_readers", -1, "Number of parallel readers for training data") -flags.DEFINE_integer("save_checkpoints_steps", 1000, "Save checkpoints every this many steps") - -# 模型参数 -flags.DEFINE_integer("batch_size", 1024, "Training batch size") -flags.DEFINE_float("learning_rate", 0.005, "Learning rate") -flags.DEFINE_string("hidden_units", "512,256,128", - "Comma-separated list of number of units in each hidden layer of the deep part") -flags.DEFINE_boolean("batch_norm", True, "Perform batch normalization (True or False)") -flags.DEFINE_float("dropout_rate", 0.1, "Dropout rate") -flags.DEFINE_string("activation", "dice", "Dense layer activation, supported strings are in {'prelu', dice'}") -flags.DEFINE_boolean("mini_batch_aware_regularization", True, "Whether to use mini_batch_aware_regularization") -flags.DEFINE_float("l2_lambda", 0.2, "Coefficient when using mini_batch_aware_regularization") -flags.DEFINE_boolean("use_softmax", False, "Whether to use softmax on attention score") - -FLAGS = flags.FLAGS - - -def create_feature_columns() -> Tuple[list, list, list, list, list]: - """ - - Returns: - dense_feature_columns (list): 连续特征的feature_columns - category_feature_columns (list): 类别特征的feature_columns - target_feedid_feature_columns (list): 目标feed的feature_columns - sequence_feature_columns (list): 历史行为队列的feature_columns - label_feature_columns (list): 因变量的feature_columns - """ - - category_feature_columns, dense_feature_columns = [], [] - target_feedid_feature_columns, sequence_feature_columns = [], [] - label_feature_columns = [] - - # 连续特征 - videoplayseconds = fc.numeric_column('videoplayseconds', default_value=0.0) - u_read_comment_7d_sum = fc.numeric_column('u_read_comment_7d_sum', default_value=0.0) - u_like_7d_sum = fc.numeric_column('u_like_7d_sum', default_value=0.0) - u_click_avatar_7d_sum = fc.numeric_column('u_click_avatar_7d_sum', default_value=0.0) - u_forward_7d_sum = fc.numeric_column('u_forward_7d_sum', default_value=0.0) - u_comment_7d_sum = fc.numeric_column('u_comment_7d_sum', default_value=0.0) - u_follow_7d_sum = fc.numeric_column('u_follow_7d_sum', default_value=0.0) - u_favorite_7d_sum = fc.numeric_column('u_favorite_7d_sum', default_value=0.0) - - i_read_comment_7d_sum = fc.numeric_column('i_read_comment_7d_sum', default_value=0.0) - i_like_7d_sum = fc.numeric_column('i_like_7d_sum', 
-    i_click_avatar_7d_sum = fc.numeric_column('i_click_avatar_7d_sum', default_value=0.0)
-    i_forward_7d_sum = fc.numeric_column('i_forward_7d_sum', default_value=0.0)
-    i_comment_7d_sum = fc.numeric_column('i_comment_7d_sum', default_value=0.0)
-    i_follow_7d_sum = fc.numeric_column('i_follow_7d_sum', default_value=0.0)
-    i_favorite_7d_sum = fc.numeric_column('i_favorite_7d_sum', default_value=0.0)
-
-    c_user_author_read_comment_7d_sum = fc.numeric_column('c_user_author_read_comment_7d_sum', default_value=0.0)
-
-    dense_feature_columns += [videoplayseconds, u_read_comment_7d_sum, u_like_7d_sum, u_click_avatar_7d_sum,
-                              u_forward_7d_sum, u_comment_7d_sum, u_follow_7d_sum, u_favorite_7d_sum,
-                              i_read_comment_7d_sum, i_like_7d_sum, i_click_avatar_7d_sum, i_forward_7d_sum,
-                              i_comment_7d_sum, i_follow_7d_sum, i_favorite_7d_sum,
-                              c_user_author_read_comment_7d_sum]
-
-    # Categorical features
-    userid = fc.categorical_column_with_vocabulary_file('userid', os.path.join(FLAGS.vocabulary_dir, 'userid.txt'))
-    feedid = fc.sequence_categorical_column_with_vocabulary_file('feedid', os.path.join(FLAGS.vocabulary_dir, 'feedid.txt'))
-    device = fc.categorical_column_with_vocabulary_file('device', os.path.join(FLAGS.vocabulary_dir, 'device.txt'))
-    authorid = fc.categorical_column_with_vocabulary_file('authorid', os.path.join(FLAGS.vocabulary_dir, 'authorid.txt'))
-    bgm_song_id = fc.categorical_column_with_vocabulary_file('bgm_song_id', os.path.join(FLAGS.vocabulary_dir, 'bgm_song_id.txt'))
-    bgm_singer_id = fc.categorical_column_with_vocabulary_file('bgm_singer_id', os.path.join(FLAGS.vocabulary_dir, 'bgm_singer_id.txt'))
-
-    manual_tag_list = fc.categorical_column_with_vocabulary_file('manual_tag_list', os.path.join(FLAGS.vocabulary_dir, 'manual_tag_id.txt'))
-    his_read_comment_7d_seq = fc.sequence_categorical_column_with_vocabulary_file('his_read_comment_7d_seq', os.path.join(FLAGS.vocabulary_dir, 'feedid.txt'))
-
-    userid_emb = fc.embedding_column(userid, 16)
-    feedid_emb = fc.shared_embedding_columns([feedid, his_read_comment_7d_seq], 16, combiner='mean')
-    device_emb = fc.embedding_column(device, 2)
-    authorid_emb = fc.embedding_column(authorid, 4)
-    bgm_song_id_emb = fc.embedding_column(bgm_song_id, 4)
-    bgm_singer_id_emb = fc.embedding_column(bgm_singer_id, 4)
-    manual_tag_id_emb = fc.embedding_column(manual_tag_list, 4, combiner='mean')
-
-    category_feature_columns += [userid_emb, device_emb, authorid_emb, bgm_song_id_emb, bgm_singer_id_emb,
-                                 manual_tag_id_emb]
-
-    target_feedid_feature_columns += [feedid_emb[0]]
-    sequence_feature_columns += [feedid_emb[1]]
-
-    # Label
-    read_comment = fc.numeric_column("read_comment", default_value=0.0)
-    label_feature_columns += [read_comment]
-
-    return dense_feature_columns, category_feature_columns, target_feedid_feature_columns, sequence_feature_columns, label_feature_columns
-
-
-def example_parser(serialized_example):
-    """
-    Parse a batch of serialized Examples
-    Args:
-        serialized_example:
-
-    Returns:
-        features, labels
-    """
-    fea_columns = total_feature_columns
-    label_columns = label_feature_columns
-
-    feature_spec = tf.feature_column.make_parse_example_spec(fea_columns + label_columns)
-    features = tf.parse_example(serialized_example, features=feature_spec)
-    read_comment = features.pop("read_comment")
-    return features, {"read_comment": read_comment}
-
-
-def train_input_fn(filepath, example_parser, batch_size, num_epochs, shuffle_buffer_size):
-    """
-    input_fn for the DIN model (training phase)
-    Args:
-        filepath (str): path of the train/eval set
-        example_parser (function): function that parses the Examples
-        batch_size (int): number of samples per batch
-        num_epochs (int): number of training epochs
-        shuffle_buffer_size (int): buffer size used when shuffling
-
-    Returns:
-        dataset
-    """
-
-    dataset = tf.data.TFRecordDataset(filepath)
-    if shuffle_buffer_size > 0:
-        dataset = dataset.shuffle(shuffle_buffer_size)
-    dataset = dataset.repeat(num_epochs)
-    dataset = dataset.batch(batch_size)
-    dataset = dataset.map(example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    dataset = dataset.prefetch(1)
-
-    return dataset
-
-
-def eval_input_fn(filepath, example_parser, batch_size):
-    """
-    input_fn for the DIN model (eval phase)
-    Args:
-        filepath (str): path of the train/eval set
-        example_parser (function): function that parses the Examples
-        batch_size (int): number of samples per batch
-
-    Returns:
-        dataset
-    """
-
-    dataset = tf.data.TFRecordDataset(filepath)
-    dataset = dataset.batch(batch_size)
-    dataset = dataset.map(example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    dataset = dataset.prefetch(1)
-
-    return dataset
-
-
-def din_model_fn(features, labels, mode, params):
-    """
-    model_fn for the DIN model
-    Args:
-        features (dict): first return value of input_fn, the input sample features
-        labels (dict): second return value of input_fn, the sample labels
-        mode: tf.estimator.ModeKeys
-        params (dict): model hyperparameters
-
-    Returns:
-        tf.estimator.EstimatorSpec
-    """
-
-    # Continuous features
-    with tf.variable_scope("dense_input"):
-        dense_input = fc.input_layer(features, params["dense_feature_columns"])
-
-    # Categorical features
-    with tf.variable_scope("category_input"):
-        category_input = fc.input_layer(features, params["category_feature_columns"])
-
-    # Target feed
-    with tf.variable_scope("target_input"):
-        target_input, _ = tf.contrib.feature_column.sequence_input_layer(features, params["target_feedid_feature_columns"])  # (B, 1, H)
-        target_input = tf.squeeze(target_input, axis=1)  # (B, H)
-
-    # Historical behavior sequence
-    with tf.variable_scope("his_seq_input"):
-        sequence_input, sequence_length = tf.contrib.feature_column.sequence_input_layer(features, params["sequence_feature_columns"])  # (B, T, H), (B,)
-
-    # Attention
-    with tf.variable_scope("attention_part"):
-        attention_output = din_attention(target_input, sequence_input, sequence_length, is_softmax=params["use_softmax"])  # (B, H)
-
-    # Concat all inputs
-    concat_all = tf.concat([dense_input, category_input, target_input, attention_output], axis=-1)
-
-    # Fully connected layers
-    with tf.variable_scope("fcn"):
-        net = concat_all
-        for i, unit in enumerate(params["hidden_units"]):
-            layer_index = i + 1
-            net = tf.layers.dense(net, unit, activation=None)
-            if params["activation"] == "dice":
-                net = dice(net, name=layer_index)
-            else:
-                net = prelu(net, name=layer_index)
-            if params["batch_norm"]:
-                net = tf.layers.batch_normalization(net, training=(mode == tf.estimator.ModeKeys.TRAIN))
-            if "dropout_rate" in params and 0.0 < params["dropout_rate"] < 1.0:
-                net = tf.layers.dropout(net, params["dropout_rate"], training=(mode == tf.estimator.ModeKeys.TRAIN))
-
-    logit = tf.layers.dense(net, 1)
-
-    # ----- PREDICT behavior -----
-    prediction = tf.sigmoid(logit, name="prediction")
-    if mode == tf.estimator.ModeKeys.PREDICT:
-        predictions = {
-            'probabilities': prediction
-        }
-        export_outputs = {
-            'prediction': tf.estimator.export.PredictOutput(predictions)
-        }
-        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
-    # ----- end PREDICT -----
-
-    y = labels["read_comment"]
-    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit), name="ce_loss")
-    if params["mini_batch_aware_regularization"] and params["l2_lambda"] > 0:
-        embedding_vars = tf.concat([category_input, target_input, attention_output], axis=-1)
-        l2_loss = params["l2_lambda"] * tf.nn.l2_loss(embedding_vars) / tf.cast(tf.shape(embedding_vars)[0], dtype=tf.float32)
-        loss = tf.add_n([loss, l2_loss])
-
-    accuracy = tf.metrics.accuracy(labels=y, predictions=tf.to_float(tf.greater_equal(prediction, 0.5)))
-    auc = tf.metrics.auc(labels=y, predictions=prediction)
-
-    # ----- EVAL behavior -----
-    metrics = {"eval_accuracy": accuracy, "eval_auc": auc}
-    if mode == tf.estimator.ModeKeys.EVAL:
-        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
-    # ----- end EVAL -----
-
-    optimizer = tf.train.AdamOptimizer(learning_rate=params["learning_rate"], beta1=0.9,
-                                       beta2=0.999, epsilon=1e-8)
-    update_ops = tf.compat.v1.get_collection(tf.GraphKeys.UPDATE_OPS)
-    with tf.control_dependencies(update_ops):
-        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
-
-    # ----- TRAIN behavior -----
-    assert mode == tf.estimator.ModeKeys.TRAIN
-
-    # TensorBoard summaries
-    tf.summary.scalar("train_accuracy", accuracy[1])
-    tf.summary.scalar("train_auc", auc[1])
-
-    # Training log printing
-    log_hook = tf.train.LoggingTensorHook(
-        {
-            "train_loss": loss,
-            "train_auc": auc[1],
-            "attention_weights": attention_output,
-        },
-        every_n_iter=100
-    )
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[log_hook])
-    # ----- end TRAIN -----
-
-
-def main(unused_argv):
-    """Training entry point"""
-
-    global total_feature_columns, label_feature_columns
-    dense_feature_columns, category_feature_columns, target_feedid_feature_columns, sequence_feature_columns, label_feature_columns = create_feature_columns()
-    total_feature_columns = dense_feature_columns + category_feature_columns + target_feedid_feature_columns + sequence_feature_columns
-
-    params = {
-        "dense_feature_columns": dense_feature_columns,
-        "category_feature_columns": category_feature_columns,
-        "sequence_feature_columns": sequence_feature_columns,
-        "target_feedid_feature_columns": target_feedid_feature_columns,
-        "hidden_units": FLAGS.hidden_units.split(','),
-        "dropout_rate": FLAGS.dropout_rate,
-        "batch_norm": FLAGS.batch_norm,
-        "learning_rate": FLAGS.learning_rate,
-        "activation": FLAGS.activation,
-        "mini_batch_aware_regularization": FLAGS.mini_batch_aware_regularization,
-        "l2_lambda": FLAGS.l2_lambda,
-        "use_softmax": FLAGS.use_softmax,
-    }
-    print(params)
-
-    estimator = tf.estimator.Estimator(
-        model_fn=din_model_fn,
-        params=params,
-        config=tf.estimator.RunConfig(model_dir=FLAGS.model_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps)
-    )
-
-    train_spec = tf.estimator.TrainSpec(
-        input_fn=lambda: train_input_fn(filepath=FLAGS.train_data, example_parser=example_parser,
-                                        batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs,
-                                        shuffle_buffer_size=FLAGS.shuffle_buffer_size),
-        max_steps=FLAGS.train_steps
-    )
-
-    feature_spec = tf.feature_column.make_parse_example_spec(total_feature_columns)
-    serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
-    exporters = [
-        tf.estimator.BestExporter(
-            name="best_exporter",
-            serving_input_receiver_fn=serving_input_receiver_fn,
-            exports_to_keep=5)
-    ]
-    eval_spec = tf.estimator.EvalSpec(
-        input_fn=lambda: eval_input_fn(filepath=FLAGS.eval_data, example_parser=example_parser,
-                                       batch_size=FLAGS.batch_size),
-        throttle_secs=600,
-        steps=None,
-        exporters=exporters
-    )
-
-    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-
-    # Evaluate metrics.
-    metrics = estimator.evaluate(input_fn=lambda: eval_input_fn(filepath=FLAGS.eval_data, example_parser=example_parser,
-                                                                batch_size=FLAGS.batch_size))
-    for key in sorted(metrics):
-        print('%s: %s' % (key, metrics[key]))
-
-    results = estimator.predict(input_fn=lambda: eval_input_fn(filepath=FLAGS.eval_data, example_parser=example_parser,
-                                                               batch_size=FLAGS.batch_size))
-    predicts_df = pd.DataFrame.from_dict(results)
-    predicts_df['probabilities'] = predicts_df['probabilities'].apply(lambda x: x[0])
-    test_df = pd.read_csv("../../dataset/wechat_algo_data1/dataframe/test.csv")
-    predicts_df['read_comment'] = test_df['read_comment']
-    predicts_df.to_csv("predictions.csv")
-    print("after evaluate")
-
-
-if __name__ == "__main__":
-    tf.logging.set_verbosity(tf.logging.INFO)
-=======
 """
     [1] Guorui Zhou, Xiaoqiang Zhu, Chenru Song, Ying Fan, Han Zhu, Xiao Ma, Yanghui
     Yan, Junqi Jin, Han Li, and Kun Gai. 2018. Deep interest network for click-through
@@ -728,5 +362,5 @@ def main(unused_argv):
 
 if __name__ == "__main__":
     tf.logging.set_verbosity(tf.logging.INFO)
->>>>>>> 734986b93a9246f05fb1b15f98977242f436de04
-    tf.app.run(main=main)
\ No newline at end of file
+    tf.app.run(main=main)
+
\ No newline at end of file
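
When `mini_batch_aware_regularization` is on, the model_fn above does not implement the full mini-batch aware regularization of the DIN paper; it approximates it with an L2 penalty on the embedding activations of the current batch, scaled by 1/batch_size, so only embeddings actually looked up in that batch are penalized. A sketch of that surrogate in isolation (assuming TF 1.x; `tf.nn.l2_loss` computes sum(t**2)/2):

```python
import tensorflow as tf

def batch_embedding_l2(embedding_output, l2_lambda):
    # embedding_output: (batch, D) tensor, e.g. the concat of the category,
    # target and attention outputs. Only rows touched by this batch
    # contribute, which is what makes the penalty "mini-batch aware".
    batch_size = tf.cast(tf.shape(embedding_output)[0], tf.float32)
    return l2_lambda * tf.nn.l2_loss(embedding_output) / batch_size
```
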
diff --git a/algorithm/DIN/din_attention.py b/algorithm/DIN/din_attention.py
index 2ed2683..2299d0b 100644
--- a/algorithm/DIN/din_attention.py
+++ b/algorithm/DIN/din_attention.py
@@ -1,65 +1,3 @@
-<<<<<<< HEAD
-import tensorflow as tf
-
-
-def din_attention(query, keys, keys_length, is_softmax=False):
-    """
-    The attention module of the DIN model
-    Args:
-        query (tf.Tensor): target item, shape=(B, H)
-        keys (tf.Tensor): historical behavior sequence, shape=(B, T, H)
-        keys_length (tf.Tensor): length of the behavior sequence, used to build the mask, shape=(B,)
-        is_softmax (bool): whether to apply softmax to the attention weights
-
-    Returns:
-        tf.Tensor, the weighted sum pooling result
-    """
-
-    embedding_dim = query.shape[-1].value
-    query = tf.tile(query, multiples=[1, tf.shape(keys)[1]])  # (B, H*T)
-    query = tf.reshape(query, shape=(-1, tf.shape(keys)[1], embedding_dim))  # (B, T, H)
-    cross_all = tf.concat([query, keys, query - keys, query * keys], axis=-1)  # (B, T, 4*H)
-    d_layer_1_all = tf.layers.dense(cross_all, 64, activation=tf.nn.relu, name='f1_att', reuse=tf.AUTO_REUSE)  # (B, T, 64)
-    d_layer_2_all = tf.layers.dense(d_layer_1_all, 32, activation=tf.nn.relu, name='f2_att', reuse=tf.AUTO_REUSE)  # (B, T, 32)
-    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)  # (B, T, 1)
-    output_weight = d_layer_3_all  # (B, T, 1)
-
-    # Mask
-    keys_mask = tf.sequence_mask(keys_length, tf.shape(keys)[1])  # (B, T)
-    keys_mask = tf.expand_dims(keys_mask, -1)  # align with output_weight, (B, T, 1)
-
-    if is_softmax:
-        paddings = tf.ones_like(output_weight) * (-2 ** 32 + 1)  # (B, T, 1)
-        output_weight = tf.where(keys_mask, output_weight, paddings)  # (B, T, 1)
-        # Scale to avoid vanishing gradients
-        output_weight = output_weight / (embedding_dim ** 0.5)  # (B, T, 1)
-        output_weight = tf.nn.softmax(output_weight, axis=1)  # (B, T, 1)
-    else:  # as in the original paper, no softmax activation
-        output_weight = tf.cast(keys_mask, tf.float32)  # (B, T, 1)
-
-    outputs = tf.matmul(output_weight, keys, transpose_a=True)  # (B, 1, T) * (B, T, H) = (B, 1, H)
-    outputs = tf.squeeze(outputs, 1)  # (B, H)
-
-    return outputs
-
-
-if __name__ == "__main__":
-    # Test
-    # B=2, T=3, H=4
-    # fake_keys = tf.zeros(shape=(2, 3, 4))
-    fake_keys = tf.random_normal(shape=(2, 3, 4))
-    fake_query = tf.random_normal(shape=(2, 4))
-    fake_keys_length = tf.constant([1, 3])
-    attention_out1 = din_attention(fake_query, fake_keys, fake_keys_length, is_softmax=False)
-    attention_out2 = din_attention(fake_query, fake_keys, fake_keys_length, is_softmax=True)
-
-    with tf.Session() as sess:
-        sess.run(tf.global_variables_initializer())
-        print("Without softmax activation:")
-        print(sess.run(attention_out1))
-        print("With softmax activation:")
-        print(sess.run(attention_out2))
-=======
 import tensorflow as tf
 
 
@@ -119,5 +57,4 @@ def din_attention(query, keys, keys_length, is_softmax=False):
         print("Without softmax activation:")
         print(sess.run(attention_out1))
         print("With softmax activation:")
-        print(sess.run(attention_out2))
->>>>>>> 734986b93a9246f05fb1b15f98977242f436de04
+        print(sess.run(attention_out2))
\ No newline at end of file
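
In the attention above, each timestep's weight is an MLP score over [query, keys, query - keys, query * keys], and padded timesteps are masked out. With `is_softmax=True` the masked scores are scaled by 1/sqrt(H) and softmax-normalized over time; note that the `is_softmax=False` branch as written keeps only the 0/1 mask as the weights, which reduces to sum pooling over the valid positions. A NumPy sketch of just the masking and pooling step, with the raw scores assumed precomputed:

```python
import numpy as np

def masked_pool(scores, keys, keys_length, use_softmax):
    # scores: (B, T, 1) raw attention scores; keys: (B, T, H); keys_length: (B,)
    B, T, H = keys.shape
    mask = (np.arange(T)[None, :] < keys_length[:, None])[..., None]  # (B, T, 1)
    if use_softmax:
        # Mask with a large negative value, scale by sqrt(H), softmax over T
        # (assumes every row has at least one valid key).
        logits = np.where(mask, scores, -2.0 ** 32 + 1) / np.sqrt(H)
        w = np.exp(logits - logits.max(axis=1, keepdims=True))
        w = w / w.sum(axis=1, keepdims=True)
    else:
        w = mask.astype(np.float64)  # mirrors the non-softmax branch: 0/1 weights
    # (B, 1, T) @ (B, T, H) -> (B, 1, H) -> (B, H)
    return np.squeeze(np.swapaxes(w, 1, 2) @ keys, axis=1)
```
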
diff --git a/algorithm/DIN/result.md b/algorithm/DIN/result.md
index 9c5372b..a226158 100644
--- a/algorithm/DIN/result.md
+++ b/algorithm/DIN/result.md
@@ -1,16 +1,4 @@
-<<<<<<< HEAD
-
-| Experiment No. | activation | mini_batch_aware_regularization | l2_lambda | use_softmax | eval_auc |
-| ------- | ------- | ------- | ------- | ------- | ------- |
-| 1 | dice | True | 0.2 | False | 0.90204114 |
-| 2 | prelu | True | 0.2 | False | 0.9070767 |
-| 3 | dice | False | 0.2 | False | 0.9115021 |
-| 4 | prelu | False | 0.2 | False | 0.91133076 |
-| 5 | dice | True | 0.2 | True | 0.90439874 |
-| 6 | prelu | True | 0.2 | True | 0.9038621 |
-| 3 | dice | False | 0.2 | True | 0.9116896 |
-| 4 | prelu | False | 0.2 | True | 0.9108566 |
-=======
+
 | Experiment No. | activation | mini_batch_aware_regularization | l2_lambda | use_softmax | eval_auc |
 | ------- | ------- | ------- | ------- | ------- | ------- |
@@ -22,4 +10,3 @@
 | 6 | prelu | True | 0.2 | True | 0.9038621 |
 | 7 | dice | False | 0.2 | True | 0.9116896 |
 | 8 | prelu | False | 0.2 | True | 0.9108566 |
->>>>>>> 734986b93a9246f05fb1b15f98977242f436de04
diff --git a/algorithm/DeepFM/deepfm.py b/algorithm/DeepFM/deepfm.py
index 8485add..1d8884e 100644
--- a/algorithm/DeepFM/deepfm.py
+++ b/algorithm/DeepFM/deepfm.py
@@ -1,348 +1,3 @@
-<<<<<<< HEAD
-"""
-    [1] Guo, Huifeng, et al. "DeepFM: a factorization-machine based neural network for CTR prediction." arXiv preprint arXiv:1703.04247 (2017).
-
-    [2] Rendle, S. (2010, December). Factorization machines. In 2010 IEEE International Conference on Data Mining (pp. 995-1000). IEEE.
-"""
-
-import os
-from typing import List, Tuple, Any
-import pandas as pd
-import tensorflow as tf
-from tensorflow import feature_column as fc
-
-# Define the input flags
-flags = tf.app.flags
-
-# Training parameters
-flags.DEFINE_string("model_dir", "./model_dir", "Directory where model parameters, graph, etc are saved")
-flags.DEFINE_string("output_dir", "./output_dir", "Directory where the pb file is saved")
-
-# flags.DEFINE_string("output_model", "./model_output", "Path to the training data.")
-flags.DEFINE_string("train_data", "../../dataset/wechat_algo_data1/tfrecord/train.tfrecord", "Path to the train data")
-flags.DEFINE_string("eval_data", "../../dataset/wechat_algo_data1/tfrecord/test.tfrecord",
-                    "Path to the evaluation data")
-flags.DEFINE_string("vocabulary_dir", "../../dataset/wechat_algo_data1/vocabulary/",
-                    "Folder where the vocabulary file is stored")
-flags.DEFINE_integer("num_epochs", 1, "Epoch of training phase")
-flags.DEFINE_integer("train_steps", 10000, "Number of (global) training steps to perform")
-flags.DEFINE_integer("shuffle_buffer_size", 10000, "Dataset shuffle buffer size")
-flags.DEFINE_integer("num_parallel_readers", -1, "Number of parallel readers for training data")
-flags.DEFINE_integer("save_checkpoints_steps", 1000, "Save checkpoints every this many steps")
-
-# Model parameters
-flags.DEFINE_integer("batch_size", 1024, "Training batch size")
-flags.DEFINE_float("learning_rate", 0.005, "Learning rate")
-flags.DEFINE_integer("embedding_dim", 8, "Embedding dimension")
-flags.DEFINE_string("hidden_units", "512,256,128",
-                    "Comma-separated list of number of units in each hidden layer of the deep part")
-flags.DEFINE_boolean("batch_norm", True, "Perform batch normalization (True or False)")
-flags.DEFINE_float("dropout_rate", 0.1, "Dropout rate")
-
-FLAGS = flags.FLAGS
-
-
-def create_feature_columns() -> Tuple[list, list, list]:
-    """
-
-    Returns:
-        first_order_feature_columns (list): feature_columns for the FM first-order features
-        second_order_feature_columns (list): feature_columns for the FM second-order features
-        label_feature_columns (list): feature_columns for the label
-    """
-
-    first_order_feature_columns, second_order_feature_columns, label_feature_columns = [], [], []
-
-    # Categorical features
-    userid = fc.categorical_column_with_vocabulary_file('userid', os.path.join(FLAGS.vocabulary_dir, 'userid.txt'))
-    feedid = fc.categorical_column_with_vocabulary_file('feedid', os.path.join(FLAGS.vocabulary_dir, 'feedid.txt'))
-    device = fc.categorical_column_with_vocabulary_file('device', os.path.join(FLAGS.vocabulary_dir, 'device.txt'))
-    authorid = fc.categorical_column_with_vocabulary_file('authorid',
-                                                          os.path.join(FLAGS.vocabulary_dir, 'authorid.txt'))
-    bgm_song_id = fc.categorical_column_with_vocabulary_file('bgm_song_id',
-                                                             os.path.join(FLAGS.vocabulary_dir, 'bgm_song_id.txt'))
-    bgm_singer_id = fc.categorical_column_with_vocabulary_file('bgm_singer_id',
-                                                               os.path.join(FLAGS.vocabulary_dir, 'bgm_singer_id.txt'))
-
-    # manual_tag_list = fc.categorical_column_with_vocabulary_file('manual_tag_list',
-    #                                                              os.path.join(FLAGS.vocabulary_dir, 'manual_tag_id.txt'))
-    # his_read_comment_7d_seq = fc.categorical_column_with_vocabulary_file('his_read_comment_7d_seq',
-    #                                                                      os.path.join(FLAGS.vocabulary_dir, 'feedid.txt'))
-
-    # FM first-order features
-    userid_one_hot = fc.indicator_column(userid)
-    feedid_one_hot = fc.indicator_column(feedid)
-    device_one_hot = fc.indicator_column(device)
-    authorid_one_hot = fc.indicator_column(authorid)
-    bgm_song_id_one_hot = fc.indicator_column(bgm_song_id)
-    bgm_singer_id_one_hot = fc.indicator_column(bgm_singer_id)
-
-    first_order_feature_columns += [userid_one_hot, feedid_one_hot, device_one_hot, authorid_one_hot,
-                                    bgm_song_id_one_hot, bgm_singer_id_one_hot]
-
-    # FM second-order features & deep-part features
-    userid_emb = fc.embedding_column(userid, FLAGS.embedding_dim)
-    feedid_emb = fc.embedding_column(feedid, FLAGS.embedding_dim)
-    # feedid_emb = fc.shared_embedding_columns([feedid, his_read_comment_7d_seq], 16, combiner='mean')
-    device_emb = fc.embedding_column(device, FLAGS.embedding_dim)
-    authorid_emb = fc.embedding_column(authorid, FLAGS.embedding_dim)
-    bgm_song_id_emb = fc.embedding_column(bgm_song_id, FLAGS.embedding_dim)
-    bgm_singer_id_emb = fc.embedding_column(bgm_singer_id, FLAGS.embedding_dim)
-    # manual_tag_id_emb = fc.embedding_column(manual_tag_list, 4, combiner='mean')
-
-    second_order_feature_columns += [userid_emb, feedid_emb, device_emb, authorid_emb, bgm_song_id_emb,
-                                     bgm_singer_id_emb]
-
-    # Label
-    read_comment = fc.numeric_column("read_comment", default_value=0.0)
-    label_feature_columns += [read_comment]
-
-    return first_order_feature_columns, second_order_feature_columns, label_feature_columns
-
-
-def example_parser(serialized_example):
-    """
-    Parse a batch of serialized Examples
-    Args:
-        serialized_example:
-
-    Returns:
-        features, labels
-    """
-    fea_columns = total_feature_columns
-    label_columns = label_feature_columns
-
-    feature_spec = tf.feature_column.make_parse_example_spec(fea_columns + label_columns)
-    features = tf.parse_example(serialized_example, features=feature_spec)
-    read_comment = features.pop("read_comment")
-    return features, {"read_comment": read_comment}
-
-
-def train_input_fn(filepath, example_parser, batch_size, num_epochs, shuffle_buffer_size):
-    """
-    input_fn for the DeepFM model (training phase)
-    Args:
-        filepath (str): path of the train/eval set
-        example_parser (function): function that parses the Examples
-        batch_size (int): number of samples per batch
-        num_epochs (int): number of training epochs
-        shuffle_buffer_size (int): buffer size used when shuffling
-
-    Returns:
-        dataset
-    """
-
-    dataset = tf.data.TFRecordDataset(filepath)
-    if shuffle_buffer_size > 0:
-        dataset = dataset.shuffle(shuffle_buffer_size)
-    dataset = dataset.repeat(num_epochs)
-    dataset = dataset.batch(batch_size)
-    dataset = dataset.map(example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    dataset = dataset.prefetch(1)
-
-    return dataset
-
-
-def eval_input_fn(filepath, example_parser, batch_size):
-    """
-    input_fn for the DeepFM model (eval phase)
-    Args:
-        filepath (str): path of the train/eval set
-        example_parser (function): function that parses the Examples
-        batch_size (int): number of samples per batch
-
-    Returns:
-        dataset
-    """
-
-    dataset = tf.data.TFRecordDataset(filepath)
-    dataset = dataset.batch(batch_size)
-    dataset = dataset.map(example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    dataset = dataset.prefetch(1)
-
-    return dataset
-
-
-def deepfm_model_fn(features, labels, mode, params):
-    """
-    model_fn for the DeepFM model
-    Args:
-        features (dict): first return value of input_fn, the input sample features
-        labels (dict): second return value of input_fn, the sample labels
-        mode: tf.estimator.ModeKeys
-        params (dict): model hyperparameters
-
-    Returns:
-        tf.estimator.EstimatorSpec
-    """
-
-    # FM first-order part
-    with tf.variable_scope("fm_first_order"):
-        fm_first_order_input = fc.input_layer(features, params["first_order_feature_columns"])
-        fm_first_order_logit = tf.layers.dense(fm_first_order_input, 1, name="fm_first_order_dense")  # (batch, 1)
-
-    # Collect the embedding of each categorical feature
-    fields_embeddings = []
-    # Collect the element-wise square of each embedding
-    fields_squared_embeddings = []
-    for cat_feature_column in params["second_order_feature_columns"]:
-        embed_input = fc.input_layer(features, [cat_feature_column])  # (batch, K)
-        fields_embeddings.append(embed_input)
-        fields_squared_embeddings.append(tf.square(embed_input))
-
-    # FM second-order part
-    with tf.variable_scope('fm_second_order'):
-        # Sum first, then square element-wise: the minuend in the simplified FM formula
-        sum_embedding_then_square = tf.square(tf.add_n(fields_embeddings))  # (batch, K)
-        # Square element-wise first, then sum: the subtrahend in the simplified FM formula
-        square_embedding_then_sum = tf.add_n(fields_squared_embeddings)  # (batch, K)
-
-        fm_second_order_logit = tf.reduce_sum(0.5 * (sum_embedding_then_square - square_embedding_then_sum),
-                                              axis=1,
-                                              keepdims=True)  # (batch, 1)
-
-    # Deep part
-    with tf.variable_scope('fm_deep'):
-        deep_input = tf.concat(fields_embeddings, axis=1)  # (batch, F*K)
-        net = deep_input
-        for unit in params["hidden_units"]:
-            net = tf.layers.dense(net, unit, activation=tf.nn.relu)
-            if "dropout_rate" in params and 0.0 < params["dropout_rate"] < 1.0:
-                net = tf.layers.dropout(net, params["dropout_rate"], training=(mode == tf.estimator.ModeKeys.TRAIN))
-            if params["batch_norm"]:
-                net = tf.layers.batch_normalization(net, training=(mode == tf.estimator.ModeKeys.TRAIN))
-        deep_logit = tf.layers.dense(net, 1)  # (batch, 1)
-
-    total_logit = tf.add_n([fm_first_order_logit, fm_second_order_logit, deep_logit])  # (batch, 1)
-
-    # ----- PREDICT behavior -----
-    prediction = tf.sigmoid(total_logit, name="prediction")
-    if mode == tf.estimator.ModeKeys.PREDICT:
-        predictions = {
-            'probabilities': prediction,
-            'fm_first_order_logit': fm_first_order_logit,
-            'fm_second_order_logit': fm_second_order_logit,
-            'deep_logit': deep_logit,
-            # 'deep_input': deep_input,
-            # 'deep_part_final_output': net
-
-        }
-        export_outputs = {
-            'prediction': tf.estimator.export.PredictOutput(predictions)
-        }
-        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
-    # ----- end PREDICT -----
-
-    y = labels["read_comment"]
-    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=total_logit), name="loss")
-
-    accuracy = tf.metrics.accuracy(labels=y, predictions=tf.to_float(tf.greater_equal(prediction, 0.5)))
-    auc = tf.metrics.auc(labels=y, predictions=prediction)
-
-    # ----- EVAL behavior -----
-    metrics = {"eval_accuracy": accuracy, "eval_auc": auc}
-    if mode == tf.estimator.ModeKeys.EVAL:
-        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
-    # ----- end EVAL -----
-
-    optimizer = tf.train.AdamOptimizer(learning_rate=params["learning_rate"], beta1=0.9,
-                                       beta2=0.999, epsilon=1e-8)
-    update_ops = tf.compat.v1.get_collection(tf.GraphKeys.UPDATE_OPS)
-    with tf.control_dependencies(update_ops):
-        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
-
-    # ----- TRAIN behavior -----
-    assert mode == tf.estimator.ModeKeys.TRAIN
-
-    # TensorBoard summaries
-    tf.summary.scalar("train_accuracy", accuracy[1])
-    tf.summary.scalar("train_auc", auc[1])
-
-    # Training log printing
-    log_hook = tf.train.LoggingTensorHook(
-        {
-            "train_loss": loss,
-            "train_auc": auc[1],
-            # "fm_first_order_logit": fm_first_order_logit,
-            # "fm_second_order_logit": fm_second_order_logit,
-            "deep_logit": deep_logit,
-            'deep_input': deep_input,
-            'deep_part_final_output': net
-        },
-        every_n_iter=100
-    )
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[log_hook])
-    # ----- end TRAIN -----
-
-
-def main(unused_argv):
-    """Training entry point"""
-
-    global total_feature_columns, label_feature_columns
-    first_order_feature_columns, second_order_feature_columns, label_feature_columns = create_feature_columns()
-    total_feature_columns = first_order_feature_columns + second_order_feature_columns  # + label_feature_columns
-
-    params = {
"first_order_feature_columns": first_order_feature_columns, - "second_order_feature_columns": second_order_feature_columns, - 'hidden_units': FLAGS.hidden_units.split(','), - "dropout_rate": FLAGS.dropout_rate, - "batch_norm": FLAGS.batch_norm, - "learning_rate": FLAGS.learning_rate, - } - print(params) - print(FLAGS.embedding_dim, FLAGS.num_epochs) - - estimator = tf.estimator.Estimator( - model_fn=deepfm_model_fn, - params=params, - config=tf.estimator.RunConfig(model_dir=FLAGS.model_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) - ) - - train_spec = tf.estimator.TrainSpec( - input_fn=lambda: train_input_fn(filepath=FLAGS.train_data, example_parser=example_parser, - batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs, - shuffle_buffer_size=FLAGS.shuffle_buffer_size), - max_steps=FLAGS.train_steps - ) - - feature_spec = tf.feature_column.make_parse_example_spec(total_feature_columns) - serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec) - exporters = [ - tf.estimator.BestExporter( - name="best_exporter", - serving_input_receiver_fn=serving_input_receiver_fn, - exports_to_keep=5) - ] - eval_spec = tf.estimator.EvalSpec( - input_fn=lambda: eval_input_fn(filepath=FLAGS.eval_data, example_parser=example_parser, - batch_size=FLAGS.batch_size), - throttle_secs=600, - steps=None, - exporters=exporters - ) - - tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) - - # Evaluate Metrics. - metrics = estimator.evaluate(input_fn=lambda: eval_input_fn(filepath=FLAGS.eval_data, example_parser=example_parser, - batch_size=FLAGS.batch_size)) - for key in sorted(metrics): - print('%s: %s' % (key, metrics[key])) - - results = estimator.predict(input_fn=lambda: eval_input_fn(filepath=FLAGS.eval_data, example_parser=example_parser, - batch_size=FLAGS.batch_size)) - predicts_df = pd.DataFrame.from_dict(results) - predicts_df['probabilities'] = predicts_df['probabilities'].apply(lambda x: x[0]) - test_df = pd.read_csv("../../dataset/wechat_algo_data1/dataframe/test.csv") - predicts_df['read_comment'] = test_df['read_comment'] - predicts_df.to_csv("predictions.csv") - print("after evaluate") - - -if __name__ == "__main__": - tf.logging.set_verbosity(tf.logging.INFO) - tf.app.run(main=main) -======= """ [1] Guo, Huifeng, et al. "DeepFM: a factorization-machine based neural network for CTR prediction." arXiv preprint arXiv:1703.04247 (2017). 
diff --git a/algorithm/DeepFM/result.md b/algorithm/DeepFM/result.md
index 726f616..b6a0083 100644
--- a/algorithm/DeepFM/result.md
+++ b/algorithm/DeepFM/result.md
@@ -1,16 +1,3 @@
-<<<<<<< HEAD
-
-| Experiment No. | epoch | learning_rate | embedding_dim | batch_norm | dropout_rate | eval_auc |
-| ------- | ------- | ------- | ------- | ------- | ------- | ------- |
-| 1 | 1 | 0.005 | 8 | True | 0.1 | 0.8113984 |
-| 2 | 2 | 0.005 | 8 | True | 0.1 | 0.85092986 |
-| 3 | 2 | 0.005 | 16 | True | 0.1 | 0.8529998 |
-
-
-
-
-
-=======
 
 | Experiment No. | epoch | learning_rate | embedding_dim | batch_norm | dropout_rate | eval_auc |
 | ------- | ------- | ------- | ------- | ------- | ------- | ------- |
@@ -18,8 +5,3 @@
 | 2 | 2 | 0.005 | 8 | True | 0.1 | 0.85092986 |
 | 3 | 2 | 0.005 | 16 | True | 0.1 | 0.8529998 |
-
-
-
-
->>>>>>> 734986b93a9246f05fb1b15f98977242f436de04
diff --git a/algorithm/FM/FM.py b/algorithm/FM/FM.py
deleted file mode 100644
index 0ba443e..0000000
--- a/algorithm/FM/FM.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import os
-print(os.getcwd())
-
-def foo(a:int = 1, b:int = 0) -> str :
-    """
-
-    Args:
-        a int:
-        a:
-        b:
-
-    Returns:
-
-    """
\ No newline at end of file
diff --git a/algorithm/FM/__init__.py b/algorithm/FM/__init__.py
deleted file mode 100644
index e69de29..0000000